\documentclass[onecolumn]{article}
%%%%%%%%%%%%%%%%%%%%%%

\usepackage{authblk}

\usepackage{natbib,stfloats}
%\usepackage[super,numbers]{natbib}
\usepackage{mathrsfs}
\usepackage{array}
\usepackage{graphicx}
\usepackage{booktabs} % For formal tables
\newcolumntype{H}{>{\setbox0=\hbox\bgroup}c<{\egroup}@{}}

\usepackage{epstopdf}
\usepackage{longtable}


\usepackage{setspace}
\doublespacing


%%%%%%%%%%%%%%%%%
\begin{document}%
%%%%%%%%%%%%%%%%%



\date{}


\title{A data science approach to movies and film director analysis}


\author{Christopher May, Lior Shamir}
\affil{Lawrence Technological University \\ 21000 W Ten Mile Rd., Southfield, MI 48075 USA}

\maketitle


\begin{abstract}


The creation of movies involves a careful process of planning, recording, and editing of the visual content, where the most influential role in the process is performed by the film director. Here we propose a quantitative computer-based analysis of movies and film directors to identify similarities that can indicate influential links. The method works by first extracting a comprehensive set of numerical image content descriptors from a large number of frames from each movie. Then, the most informative descriptors are selected, and the values of the frames are compared to each other to create a similarity matrix between the movies. The similarity matrix is visualized using a phylogeny to show a network of similarities between movies. Experimental results with a dataset of 104 movies show that the method is able to predict the movie based on a single frame with accuracy of $\sim$74\%, and the similarity analysis tends to cluster movies of the same directors or of the same movie series. These results show that computer analysis is able to analyze similarities between movies, providing a new paradigm of studying and analyzing cinematography.

\end{abstract}



\section{Introduction}
\label{introduction}
 
Cinema is one of the most popular forms of art, with millions and even billions of viewers around the world \citep{mulvey1989visual}. As cinema progressed since the 19th century, different film genres developed, as well as signs and symbols that defined how films are communicated \citep{metz1974film,monaco2000read}. The development of different movie genres led to different ways in which movies can be communicated to the viewers, and film critics as well as non-expert viewers can often tell between different movies, genres, and directors.

Movies and movie genres have high impact on culture, and that impact can be attributed to Hollywood \citep{belton1996movies,giroux2001breaking} as well as non-Hollywood movie industries \citep{hogan2009understanding}. For instance, the Vietnam war and American policy and culture related to it were highly influenced by the movie genre that it sparked, and the way the Vietnam war was communicated through these movies \citep{corrigan1991cinema}. Films made after the war such as ``Full Metal Jacket'' by Stanley Kubrick and ``Apocalypse Now'' by Francis Ford Coppola directly tapped into the mood during the Vietnam War, while the latter also worked in themes and motifs from other works like the novella ``Heart of Darkness'' by Joseph Conrad. Another example is the impact of movies on the perception of occupations such as lawyers or accountants through the way these occupations were communicated to the viewers \citep{beard1994popular}.

The person who has the strongest influence on the outcome of the movie-making activity is the film director \citep{monaco2000read}. The film director makes the set of decisions such as the position of the camera, the lighting, position of the actors, sounds, etc., and also supervises the editing of the movie before its final production. That collection of decisions is noticeable to a knowledgeable viewer, and has substantial impact on the overall visual outcome. While there are other participants who can influence the movie such as the film editors, the directors are very often involved with the entire process of the film creation, including film editing. Although more modern ways of film making allow a group of people to control the process of movie creation collectively \citep{Newman2008}, the ``mainstream'' movie industry is dominated by the traditional approach in which a single person has a substantial impact on the created movie.

In the sense of automatic analysis of video data, substantial work has been done on video analysis in the context of action recognition \citep{soomro2012ucf101,kuehne2011hmdb}, or analysis of video data for surveillance \citep{hsieh2006automatic,brutzer2011evaluation}. That kind of analysis aims at identifying the specific and well-defined activities in the video, rather than the artistic aspects of the movies. 

Previous studies of automatic analysis of movie data include automatic rating of movies based on social media analytics \citep{oghina2012predicting}. Other work focused on recommendation framework for movies \citep{melville2002content,debnath2008feature,jung2012attribute,lamprecht2015improving}. Movies have also been analyzed as networks through actors participating in the same movies \citep{gallos2013imdb}, or critics who review and rate the same movies \citep{fatemi2012empirical}. Social networks have also been widely used as tools to analyze movies without the need to analyze the audio or video data directly \citep{weng2009rolenet}. 


Here we apply image analysis algorithms to analyze and profile movies by their visual content. The algorithms are based on previous work of analyzing visual art, demonstrating the ability of algorithms to differentiate between different schools of art \citep{shamir2010impressionism}, identify similarities between painters, and profile art history in a manner that is largely in agreement with the analysis of art historians \citep{shamir2012computer}. The quantitative analysis of the visual content also allowed profiling the changes in artistic style over time \citep{burcoff2017computer}, and identified unique styles of certain painters such as Vincent Van Gogh and Jackson Pollock \citep{shamir2012computer,shamir2015makes}.

Since computer algorithms are able to analyze visual art to a certain level, the research question of this study is whether such algorithms are able to analyze the art expressed in the process of movie making. For that purpose, each frame in the movie can be considered a work of art, and the analysis of a large number of frames can be used to profile potential links of similarities or influence between movies. Quantitative analysis of movies can open the door to a new paradigm in cinematography, where movies can be analyzed quantitatively by computers that can measure a wide range of visual cues that reflect the artistic style of the film maker.


\section{Data}
\label{data}

The data used in the experiment are a set of 104 Hollywood films from 24 different directors. The films that were selected are among the most popular films in Hollywood's recent history, made between the years of 1960 and 2017, and had substantial impact on a large and diverse population of viewers \citep{bordwell2003excessively,maltby2003hollywood}. The genre of the movie was also taken into account, although the genre-movie relationship is not necessarily a one-to-many relationship, but in fact a movie can belong in more than one movie genre. For instance, some movies can be considered action or drama movies, but at the same time be considered a comedy or horror film.

In this study the genre was determined based on the primary intent of the movie and target audience, and was categorized into one of seven genres: comedy, drama, action, horror, thriller, western, and science fiction. For example, Evil Dead 2 is a horror film that is also considered to be a comedy, but its main goal and target audience is the horror genre. The complete list of movies, genres and directors is summarized in Table~\ref{movies}.


\scriptsize


% \onecolumn

% \begin{table}[l]
\begin{longtable}{|l|c|c|c|H}
%\begin{tabular}{|l|c|c|c|c|}
%\begin{tabular}{ll}
\hline
Title & Director & Year & Genre & Resolution \\                                                                                                                                                                                                                                                                                                    \hline
2001   & Stanley Kubrick   & 1968  & SciFi & 720p   \\
The Adventures of Tintin & Steven Spielberg & 2008 & Action & 720p \\
Alien    & Ridley Scott & 1979 & SciFi & 720p \\
Aliens  & James Cameron & 1986 &  SciFi & 720p \\
Alien Covenant & Ridley Scott& 2017 & SciFi & 720p \\
Angels and Demons & Ron Howard & 2009 & Thriller & 720p \\
Ant Man & Peyton Reed & 2015 & SciFi & 720p \\
Apollo 13 & Ron Howard & 1995 & Drama & 720p \\ 
Army of Darkness & Sam Raimi & 1992 & Horror & 720p \\
Avengers & Joss Whedon & 2012 & Action & 720p \\ 
Avengers: Age of Ultron & Joss Whedon & 2015 & Action & 720p \\
The Aviator & Martin Scorsese & 2004 & Drama & 720p \\
The Birds & Alfred Hitchcock & 1963 & Horror & 720p \\ 
Baby Driver & Edgar Wright & 2017 & Action & 720p \\
Backdraft & Ron Howard & 1991 & Drama & 720p \\ 
A Beautiful Mind & Ron Howard & 2001 & Drama & 720p \\
Blade Runner & Ridley Scott & 1982 & SciFi & 720p \\
Blade Runner 2049 & Denis Villeneuve & 2017 & SciFi & 720p \\
Bottle Rocket & Wes Anderson & 1996 & Comedy & 720p \\
Cape Fear & Martin Scorsese & 1991 & Thriller & 720p \\
Captain America  & Joe Johnston & 2011 & Action & 720p \\
Captain America: Civil War & Joe Russo, Anthony Russo & 2016 & Action & 720p \\
Captain America Winter Soldier & Joe Russo, Anthony Russo &  2014 & Action & 720p \\
The Color Purple & Steven Spielberg & 1985 & Drama & 720p \\
Darjeeling Limited & Wes Anderson & 2007 & Comedy & 720p \\
Darkman & Sam Raimi & 1990 & Action & 720p \\
The Da Vinci Code & Ron Howard & 2006 & Thriller & 720p \\
The Departed & Martin Scorsese & 2006 & Drama & 720p \\ 
Drag Me to Hell & Sam Raimi & 2009 & Horror & 720p \\
The Dark Knight Rises & Christopher Nolan & 2012 & Action & 720p \\
The Dark Knight & Christopher Nolan & 2008 & Action & 720p \\
Dr. Strange & Scott Derrickson & 2016 & Action & 720p \\
Dunkirk & Christopher Nolan & 2017 & Drama & 720p \\
ET & Steven Spielberg & 1982 & SciFi & 720p \\
Evil Dead 2013 & Fede Álvarez & 2013 & Horror & 720p \\
Evil Dead & Sam Raimi & 1981 & Horror & 720p \\
Evil Dead 2 & Sam Raimi & 1987 & Horror & 720p \\
Eyes Wide Shut & Stanley Kubrick & 1999 & Thriller & 720p \\
Fantastic Mr. Fox & Wes Anderson & 2009 & Comedy & 720p \\
Following & Christopher Nolan & 1998 & Thriller & 720p \\
Friday The 13th & Sean S. Cunningham & 1980 & Horror & 720p \\
Friday The 13th Pt. 2 & Steve Miner & 1981 & Horror & 720p \\
Friday The 13th Pt. 3 & Steve Miner & 1982 & Horror & 720p \\
Friday The 13th: The Final Chapter & Joseph Zito & 1985 & Horror & 720p \\
Friday The 13th Pt. 7: The New Blood &  John Carl Buechler & 1988 & Horror & 720p \\
Full Metal Jacket & Stanley Kubrick & 1987 & Drama & 720p \\
Gangs of New York & Martin Scorsese & 2002 & Drama & 720p \\
Grand Budapest Hotel & Wes Anderson & 2014 & Comedy & 720p \\
The Gift & Sam Raimi & 2000 & Thriller & 720p \\
Goodfellas & Martin Scorsese & 1990 & Drama & 720p \\
Guardians of the Galaxy & James Gunn & 2014 & Action & 720p \\
Guardians of the Galaxy Vol. 2 & James Gunn & 2017 & Action & 720p \\
Hateful Eight & Quentin Tarantino & 2015 & Western & 720p \\
Heart of the Sea & Ron Howard & 2015 & Drama & 720p \\
Hotel Chevalier & Wes Anderson & 2007 & Drama & 720p \\
Hot Fuzz & Edgar Wright & 2007 & Comedy & 720p \\
How the Grinch Stole Christmas & Ron Howard & 2000 & Comedy & 720p \\ 
Hugo & Martin Scorsese & 2011 & Drama & 720p \\
Inception & Christopher Nolan & 2010 & Thriller & 720p \\
Incredible Hulk & Louis Leterrier & 2008 & Action & 720p \\
Indiana Jones and The Raiders of the Lost Ark & Steven Spielberg & 1981 & Action & 720p \\ 
Inferno & Ron Howard & 2016 & Thriller & 720p \\
Inglourious Basterds & Quentin Tarantino & 2009 & Action & 720p \\
Insomnia & Christopher Nolan & 2002 & Thriller & 720p \\
Interstellar & Christopher Nolan & 2014 & SciFi & 720p \\
Iron Man & Jon Favreau & 2008 & Action & 720p \\
Iron Man 2 & Jon Favreau & 2010 & Action & 720p \\
Iron Man 3 & Shane Black & 2013 & Action & 720p \\
Jackie Brown & Quentin Tarantino & 1997 & Action & 720p \\
Jaws & Steven Spielberg & 1975 & Horror & 720p \\
Jurassic Park & Steven Spielberg & 1993 & Action & 720p \\ 
Kill Bill Vol. 1 & Quentin Tarantino & 2003 & Action & 720p \\
Kill Bill Vol. 2 & Quentin Tarantino & 2004 & Action & 720p \\
The Life Aquatic with Dr. Zissou & Wes Anderson & 2004 & Comedy & 720p \\
The Martian & Ridley Scott & 2015 & SciFi & 720p \\
Memento & Christopher Nolan & 2001 & Thriller & 720p \\
Moonrise Kingdom & Wes Anderson & 2012 & Comedy & 720p \\
Oz The Great and Powerful & Sam Raimi & 2013 & Action & 720p \\ 
The Prestige & Christopher Nolan & 2006 & Thriller & 720p \\
Psycho & Alfred Hitchcock & 1960 & Horror & 720p \\
Pulp Fiction & Quentin Tarantino & 1994 & Action & 720p \\
The Quick and the Dead & Sam Raimi & 1995 & Western & 720p \\
Reservoir Dogs & Quentin Tarantino & 1992 & Action & 720p \\
Royal Tenenbaums & Wes Anderson &  2001 & Comedy & 720p \\
Rush & Ron Howard &  2013 & Drama & 720p \\
Rushmore & Wes Anderson & 1998 & Comedy & 720p \\
Saving Private Ryan & Steven Spielberg & 1998 & Drama & 720p \\
Scott Pilgrim VS The World & Edgar Wright & 2010 & Action & 720p \\
Schindler's List & Steven Spielberg & 1993 & Drama & 720p \\
The Shining & Stanley Kubrick & 1980 & Horror & 720p \\
Shutter Island & Martin Scorsese & 2010 & Horror & 720p \\
Silence & Martin Scorsese & 2016 & Drama & 720p \\
Spiderman & Sam Raimi & 2002 & Action & 720p \\
Spiderman 3 & Sam Raimi & 2007 & Action & 720p \\
Spiderman: Homecoming & Jon Watts & 2017 & Action & 720p \\ 
Shaun of the Dead & Edgar Wright & 2004 & Comedy & 720p \\
The Sugarland Express & Steven Spielberg & 1974 & Drama & 720p \\
Taxi Driver & Martin Scorsese & 1976 & Drama & 720p \\
Thor & Kenneth Branagh &  2011 & Action & 720p \\ 
Thor: The Dark World & Alan Taylor & 2013 & Action & 720p \\
Thor: Ragnarok & Taika Waititi & 2017 & Action & 720p \\
The World's End & Edgar Wright & 2013 & Comedy & 720p \\
Willow & Ron Howard & 1988 & Action & 720p \\
The Wolf of Wall Street & Martin Scorsese & 2013 & Comedy & 720p \\


\hline
%\end{tabular}
\caption{Movies used in the experiment.\label{movies}} \\
\end{longtable}

% \twocolumn

\normalsize


The films are normalized into 720p so that a machine learning classifier will not be driven by differences of the resolution of the films. Each film was separated into a set of 100 frames, such that the interval between each two frames that were separated was equal throughout the movie. For instance, a 100-minute film is separated into 100 frames such that the interval between each two frames is 60 seconds, and the first frame is extracted at 00:01:00. Each frame is of resolution of 1280$\times$720, and was converted into a Tagged Image File (TIF) format image file.


\section{Movie analysis method}
\label{method}

The frames extracted from movies as described in Section~\ref{data} were analyzed using an image analysis method that can analyze large and complex image data in a comprehensive manner. The method is based on the Wndchrm feature set \citep{Sha08}, which has been used for a wide variety of tasks requiring comprehensive image analysis \citep{Sha08,orlov2008wnd,shamir2010impressionism,shamir2013automatic,shamir2012automatic,shamir2012computer,kum14,shamir2016distinguishing}.

In particular, it has been used widely to study visual art in a quantitative fashion by applying computational analysis to complex visual content \citep{shamir2010impressionism}. For instance, it showed that the computer analysis of art is largely in agreement with the way art historians view influential links between different schools of European art \citep{shamir2012art}. It was also used to identify features typical to Jackson Pollock \citep{shamir2015makes} and show evidence of mathematical similarities between Jackson Pollock and Vincent van Gogh \citep{shamir2012computer}. Another use of the Wndchrm scheme related to automatic analysis of art is the studying of art perception, showing patterns of differences between abstract expressionism and paintings by children and animals \citep{shamir2016distinguishing}.

Wndchrm computes a large set of 2881 numerical descriptors from each frame. That set of numerical image content descriptors includes various  characteristics of the visual content such as fractals, textures (Haralick, Tamura, Gabor), polynomial decomposition of the pixel intensities (Chebyshev polynomials statistics, Zernike polynomials, Chebyshev Fourier spectral analysis, Radon features), statistics of the pixel intensities (multi-scale histograms, first-four moments), high-contrast features (Prewitt edge statistics, objects statistics, Euler number), the image Gini coefficient \citep{abraham2003new}, and the image entropy. The complete description of the feature set is provided in full details in \citep{Sha08,orlov2008wnd,shamir2010impressionism,shamir2013automatic,shamir2012automatic,shamir2016distinguishing}. The source code of the method is also publicly available \citep{udatascl}.

  
 
To extract more information from each frame, the content descriptors are extracted not only from the raw pixels, but also from transforms of the frame and multi-order transforms \citep{shamir2009evaluation}. The transforms used by the scheme are the Fast Fourier Transform (FFT), Wavelet (Symlet 5, level 1) two-dimensional decomposition of the image, the Chebyshev transform, and the Edge Transform, which is the magnitude component of the image's Prewitt gradient. 

The content descriptors extracted from all transforms and combinations of transforms are the statistics and texture features, which include the first four moments, Haralick textures, multi-scale histograms, Tamura textures, and Radon features. The polynomial decomposition descriptors (Zernike features, Chebyshev statistics, and Chebyshev-Fourier spectral features) are extracted from all transforms, except from the Fourier and Wavelet transforms of the Chebyshev transform, and the Wavelet and Chebyshev transforms of the Fourier transform. The high contrast features (edge statistics, object statistics, and Gabor features) are extracted just from the raw pixels, and not from the transforms.

To filter non-informative features and weight the features by the information they contain, each numerical content descriptor $f$ is assigned a weight $W_f$ equal to its Fisher discriminant score \citep{christopher2016pattern}, as shown by Equation~\ref{fisher}

\begin{equation}
\label{fisher}
W_f = \frac{\sum_{c=1}^N(\overline{T_f}-\overline{T_{f,c}})^2}{\sum_{c=1}^N \sigma^{2}_{f,c}},
\end{equation}

where $W_f$ is the weight of feature $f$ determined by the Fisher discriminant score of feature $f$, $N$ is the number of movies in the dataset, $\overline{T_f}$ is the mean of the values of feature $f$ in the entire training set, and $\overline{T_{f,c}}$ and $\sigma_{f,c}^2$ are the mean and variance of the values of feature $f$ among all training frames of movie $c$.

The similarity $M_{I,X}$ between a certain frame $I$ and a certain movie $X$ is determined by the minimum Euclidean distance between the features of frame $I$ and any of the frames of movie $X$, as shown by Equation~\ref{distance}

\begin{equation}
\label{distance}
M_{I,X} = \min_i \sqrt{\sum_f (I_f-X_{i,f})^2},
\end{equation}

where $I_f$ is the value of feature $f$ extracted from the frame $I$, and $X_{i,f}$ is the value of feature $f$ extracted from the frame $i$ of movie $X$. Naturally, when classifying a certain frame the predicted movie is determined by the movie $X$ that has the minimum $M_{I,X}$. The similarity between any pair of movies $X$ and $Y$ is determined by the mean of $M_{I,X}$, such that $I$ is all test set frames of movie $Y$.

Analyzing the similarities between all pairs of movies in the dataset provides a matrix of all similarities between all pairs of movies. The values in the similarity matrix are normalized by dividing all values in each row by the similarity value of the movie to itself, and therefore the diagonal of the similarity matrix is 1. The similarity matrix is then visualized by using a phylogeny. The phylogeny visualization package used in this experiment is Phylip \citep{plotree1989phylip}, used with randomized input order of sequences where 97 is the seed, 10 jumbles, and Equal-Daylight arc optimization \citep{plotree1989phylip}.




\section{Results}
\label{results}

The method described in Section~\ref{method} was applied to the data described in Section~\ref{data}. In the first experiment 10 films from the dataset described in Section~\ref{data} were analyzed such that 90 frames from each movie were used for training and the remaining 10 frames for testing. The experiment was repeated 20 times such that in each run different samples were randomly allocated to the training and test sets.

The results show that the algorithm was able to associate a test frame with its movie based on the other training frames with accuracy of 79\%. Since the dataset of the experiment has 10 movies, random prediction would have been expected to correctly associate a frame with its movie in 10\% of the cases. The accuracy of the algorithm is much higher than mere chance, showing that the algorithm is capable of identifying image content that is related to a certain movie. Table~\ref{confusion_matrix1} shows the confusion matrix of the experiment.




\begin{table*}
% \scriptsize
\tiny
\centering % used for centering table
\begin{tabular}{|c|c|c|c|c|c|c|c|c|c|c|} 
\hline
   & 2001 & Baby & Eyes Wide & Full Metal & Hot & Pulp      & Scott     & Shaun of & The       & The World's   \\ [0.5ex] % inserts table
   &        & Driver & Shut         & Jacket     & Fuzz &  Fiction &  Pilgrim & the Dead &  Shining & End  \\ [0.5ex] % inserts table
%heading
\hline % inserts single horizontal line
2001 & 196 & 0 & 0 & 0& 	1 & 	0 &	0 &	0 & 	0  &	3 \\ % inserting body of the table
Baby Driver &	0 &	200 &	0 & 	0 &	0 & 	0 &	0 &	0 &	0 &	0 \\
Eyes Wide Shut & 0 &	0 &	107 & 9 & 	0 & 	0 &	0 & 	0 &	84 &	0 \\
Full Metal Jacket & 0 &	0 &	25 &	146 &	0 &	0 &	0 &	0 &	29 &	0 \\
Hot Fuzz  & 0 &	0 &	0 &	0 &	106 & 	16 &	0 &	0 &	0 &	78 \\
Pulp Fiction  & 0 &	0 &	0 &	0 &	10 & 	174 & 	0 & 	0 &	0 &	16 \\
Scott Pilgrim  & 0 &	0 &	0& 	0& 	0 &	0 & 	200 & 	0 & 	0 &	0 \\ 
Shaun of the Dead & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 200 & 0 & 0 \\
The Shining  & 1 &	0 &	57 &	10 &	0 &	0 & 	1 &	0 & 	131 & 	0 \\ 
The World's End & 0 & 0 & 0 & 0 & 62 & 9 & 0 & 0 & 0 & 129 \\

\hline %inserts single line
\end{tabular}

\caption{Confusion matrix of the experiment with 10 different movies} % title of Table
\label{confusion_matrix1}

\end {table*}


The confusion matrix shows that the higher numbers are along the diagonal of the matrix, which is aligned with the classification accuracy and therefore it is expected that most movies are classified correctly. Some movies tend to have a higher confusion between them such as ``The Shining'' and ``Eyes Wide Shut'', or ``Full Metal Jacket'' and ``Eyes Wide Shut''.

As described in Section~\ref{method}, the algorithm can provide the similarity measures between the different movies. Table~\ref{similarity_matrix1} shows the matrix of similarities between the 10 different movies. Due to the different samples used by the different classes, and also due to the imperfect accuracy of the algorithm, the measured similarity between movie {\it i} and movie {\it j} is not always identical to the measured similarity between movie {\it j} and movie {\it i}, but the values are in most cases close, as the similarity matrix shows.



\begin{table*}
% \scriptsize
\tiny
\centering % used for centering table
\begin{tabular}{|c|c|c|c|c|c|c|c|c|c|c|} 
\hline %inserts double horizontal lines
& 2001 & Baby   & Eyes Wide & Full Metal & Hot  & Pulp    & Scott    & Shaun of & The      & The World's   \\ % [0.5ex] % inserts 
&         & Driver & Shut         & Jacket      & Fuzz & Fiction & Pilgrim & the Dead & Shining & End  \\ % [0.5ex] % inserts 
%heading
\hline % inserts single horizontal line
2001 	       & 1.00 & 	0.28 & 	0.10 & 	0.11 &	0.35 & 	0.35 & 	0.13 & 	0.16 & 	0.10 & 	0.35 \\
Baby Driver 	& 0.28 & 	1.00 & 	0.06 & 0.07& 	0.62& 	0.58& 	0.07 & 	0.12 & 0.06 & 0.60 \\
Eyes Wide Shut 	& 0.10& 	0.07 & 	1.00& 	0.86& 	0.08& 	0.08 & 	0.21 & 0.08 & 0.95 &  0.08 \\
Full Metal Jacket 	& 0.09& 	0.07 &  	0.85& 	1.00& 	0.08& 	0.08 & 	0.20 & 0.07 & 0.87 &  0.08 \\
Hot Fuzz 	& 0.31 & 	0.65 & 	0.07 & 	0.08& 	1.00& 	0.89& 	0.08 & 0.13 & 0.07 &  0.98 \\
Pulp Fiction 	& 0.30 & 	0.60 & 	0.06 & 	0.07& 	0.87& 	1.00& 	0.08 & 0.12 & 0.07 & 0.87 \\
Scott Pilgrim 	& 0.12 & 	0.09 & 	0.24 & 	0.25& 	0.09& 	0.09& 	1.00 & 0.09 & 0.21 & 0.09 \\
Shaun of the Dead & 0.14 & 0.14 & 0.07 & 0.08 & 0.14 & 0.14 & 0.09 & 1.00 & 0.07 & 0.14 \\
The Shining 	& 0.09& 	0.07 & 	0.91 & 	0.85& 	0.08& 	0.08 & 0.20 & 0.07 & 1.00 & 0.08 \\ 
The World's End & 0.30 & 0.63 & 0.06 & 0.07 & 0.97 & 0.90 & 0.08 & 0.12 & 0.06 & 1.00 \\
% [1ex] % [1ex] adds vertical space
\hline %inserts single line
\end{tabular}
\caption{Similarity matrix of the 10 movies} % title of Table
\label{similarity_matrix1}
\end {table*}


The similarity matrix is visualized using a phylogeny as described in Section~\ref{method}. Figure~\ref{phylogeny1} shows the phylogeny that visualizes the similarity matrix of Table~\ref{similarity_matrix1}.
      
\begin{figure}[h]
\centering
% \includegraphics[scale=0.45]{figure1.eps}
\caption{Phylogeny of the similarity matrix of Table~\ref{similarity_matrix1}.}
\label{phylogeny1}
\end{figure}


The phylogeny shows that the three movies by Stanley Kubrick (Eyes Wide Shut, Full Metal Jacket, The Shining) were clustered close to each other. The two movies by the director Edgar Wright (The World's End, Hot Fuzz) were also placed close to each other, with a third movie (Baby Driver) placed close to that pair. The movies ``The World's End'' and ``Hot Fuzz'' have several things in common in addition to being made by the same director. The two movies are part of a conscious trilogy developed by the director. Additionally, they also share some of the same actors.

For the movies directed by Kubrick, the three movies span over three different genres (horror, drama, thriller), and also do not share any actors, and are not part of any specific movie series or set. The observation that these movies were clustered together by the algorithm indicates that the algorithm was able to detect some visual cues that reflect the way the films were made. The fact that all films were made by the same director provides an indication that the director has impact on the visual content of the film, and that impact can be identified by the algorithm.

The same methodology was also tested with a dataset of 23 films. The accuracy of automatic association of a frame to a movie was 74\%, which is far higher than the expected mere chance accuracy of $\sim$4.3\%. Figure~\ref{phylogeny2} shows the resulting phylogeny when using a dataset of 23 movies. As the figure shows, the ``Friday the 13th'' films are clearly clustered together, showing that the computer analysis was able to identify that these films are part of the same set, although not all of them were made by the same director. Close to that cluster, three movies are grouped together: The Shining, Eyes Wide Shut, and Full Metal Jacket. These films were directed by Stanley Kubrick, indicating that the computer analysis could detect visual similarities that result from the work of the same director on different films.

\begin{figure*}[h]
\centering
% \includegraphics[scale=0.9]{figure2.eps}
\caption{Phylogeny of the experiment when using a dataset of 23 movies}
\label{phylogeny2}
\end{figure*}

Another small cluster contained the movies ``The World's End'' and ``Hot Fuzz'', which are two films made by the director Edgar Wright, again showing that the similarities between movies as identified by the algorithm are closely linked to the director who made the films. The pair of movies ``Moonrise Kingdom'' and ``Fantastic Mr. Fox'' were made by film director Wes Anderson, and are placed on the same branch, as are the two films ``Army of Darkness'' and ``Evil Dead 2'', directed by Sam Raimi. It should be noted that the film ``Evil Dead 2013'' was made by a different director, Fede Alvarez, and is also placed in the phylogeny far from the movie ``Evil Dead 2''.





\begin{figure*}[h]
\centering
% \includegraphics[scale=0.95]{figure3.eps}
\caption{Phylogeny of the experiment when using a dataset of 104 movies}
\label{phylogeny104}
\end{figure*}




Figure~\ref{phylogeny104} displays the phylogeny generated after applying the same methodology to a dataset of 104 movies. The classification accuracy of associating a frame to its movie is $\sim$74\%, which is higher than mere chance accuracy of $\sim$1\%. As the phylogeny shows, horror movies are grouped together at around the top left part of the phylogeny. The movies ``Evil Dead'', ``Evil Dead 2'', and ``Army of Darkness'', are placed close to each other, and all of them were made by the same director -- Sam Raimi. The lower right nodes, and the node directly above those two, show a cluster of several movies that are part of the Marvel Cinematic Universe (MCU). These movies were made by different directors, but share characters and genre along with a movie franchise.

Some of the movies are loosely grouped in what ``Marvel'', the company that produced the movies, called ``phases'', where each phase is a set of movies that were released around the same time and were considered to share the same plot thread. Two of the three completed phases were clustered together by the algorithm. For instance, Thor, Iron Man, Iron Man 2, The Incredible Hulk, and Captain America are all part of ``phase I'', and these movies are all part of the same cluster. Just one movie from that ``phase'', The Avengers, was not clustered together with the other Marvel Phase I movies.

Iron Man 3, Thor the Dark World, Captain America Winter Soldier, Guardians of the Galaxy, Avengers Age of Ultron, and Ant Man are part of ``Phase 2''. With the exception of ``Ant Man'', all other movies of the phase are placed in the same cluster.


For some movies, such as Thor, Iron Man, The Incredible Hulk, and Captain America, there are no shared actors or directors between these movies.  
The two Avengers movies have some common elements such as shared actors. Figure~\ref{phylogeny104} shows tighter grouping of all the Marvel movies. As described in Section~\ref{method}, the frames are analyzed by a collection of general global descriptors, and without applying detection of large objects such as faces, so the presence or absence of shared actors is not expected to impact the analysis.

In Figure~\ref{phylogeny104}, in the dense cluster in the upper left, several movies by director Sam Raimi are grouped together (Evil Dead, Evil Dead 2, Darkman, The Gift, and Spiderman). It is important to note that there are two iterations of Spiderman on the chart that are not clustered together.  Spiderman and Spiderman 3 were directed by Sam Raimi and ``Spiderman Homecoming'' was directed by Jon Watts. This shows that the algorithm could separate between movies made by different directors, although the topic of the movie is the same. It could also be due to the fact that Spiderman Homecoming is an MCU film and shares a few actors with the Iron Man films, although those characters are not dominant in the film, and the algorithm uses low level features and does not identify specific actors.

In the upper branch in the lower left side of Figure~\ref{phylogeny104} there is a cluster of films by Martin Scorsese  (The Aviator, Gangs of New York, Silence and The Departed). 



The analysis is based on a large number of numerical image content descriptors, from which the most informative descriptors are selected by computing the Fisher discriminant of each feature, and then using the features with the highest Fisher discriminant scores as described in Section~\ref{method}. That is, features with higher Fisher discriminant scores are assumed to be more informative compared to features with lower Fisher discriminant scores, and therefore have more impact on the results of the analysis. Figure~\ref{featurs} shows the sums of the Fisher discriminant scores of the features used in the analysis, extracted from the different transforms.

\begin{figure*}[h]
\centering
% \includegraphics[scale=0.45, angle=90]{figure4.eps}
\caption{Fisher discriminant scores of the features used to classify and measure similarities between the movies.}
\label{featurs}
\end{figure*}

The graph shows that numerical image content descriptors computed from the raw pixels of the frames provide relatively little information about the movies compared to numerical content descriptors extracted from the transforms of the frames. For instance, the Zernike, first four moments, and Haralick texture features computed from the Fourier transform of the frames provide substantial information about the movie, as well as other features such as the Tamura textures extracted from the Chebyshev transform. Since these features are extracted from transforms of the frames, they are not intuitive, showing the complex nature of the visual content expressed in modern films.

   

\section{Conclusions}
\label{conclusions}

While movies are one of the most common forms of popular art and culture, little work has yet been done on quantitative analysis of the visual content of movies. Here we used comprehensive quantitative analysis to analyze and compare different movies in a quantitative fashion. The results show that the algorithm is able to associate a frame with the movie it is part of with accuracy far higher than mere chance, indicating the existence of visual consistency in the movies that is detectable by computer algorithms. The results also show that the algorithm can analyze and visualize the similarities between movies, in a way that allows comparing the visual content of movies in a quantitative manner.

That comparison showed that in many cases the algorithm grouped together movies that were created by the same director, even in cases where the genre and actors were different. That provides quantitative evidence that movie directors have visual impact on their films in a way that makes them visually similar regardless of their topic or genre.


The study is focused on popular Hollywood movies aiming at larger audiences, rather than artistic movies created for a smaller number of viewers. Also, movies combine the visual information and the audio information of the film's soundtrack. This study is focused on just the visual content of the movie, while it does not analyze the audio data. The results of these experiments provide an indication that the visual content created by film directors can be quantified, introducing a new approach to the studying of film making.


Database sites like IMDB and streaming sites use relevant keywords like genre, actors' names and directors to categorize and recommend movies to their users, but in most cases that information is based on metadata rather than the visual content of the movies. The results shown in this study provide evidence that movies can be categorized not just by metadata such as actors, year of production, genres, or directors, but also based on the visual content of the films.


The approach used in this study can be further developed into a methodology that can be used to study films, providing new quantitative approaches to studying the history of cinema. Such methods can join similar approaches used to study other fields of human creation such as art and music, in which the use of computers is more prevalent. The source code of the method is publicly available \citep{udatascl}.


%\section*{Acknowledgement}
%The study was funded in part by NSF grant IIS-1546079, and HHMI grant 52008705.  ???????


%\bibliographystyle{apalike}
%\bibliographystyle{plain}
%\bibliographystyle{unsrt}
%\bibliographystyle{ACM-Reference-Format}

%\bibliography{movie_analysis}



\begin{thebibliography}{}
%

\bibitem[Abraham et~al., 2003]{abraham2003new}
Roberto G. Abraham, Sidney Van Den~Bergh, and Preethi Nair, 2003.
\newblock ``A new approach to galaxy morphology. i. analysis of the sloan digital
  sky survey early data release,''
\newblock {\em The Astrophysical Journal}, volume 588, number 1, pp. 218.

\bibitem[Beard, 1994]{beard1994popular}
Victoria Beard, 1994.
\newblock ``Popular culture and professional identity: accountants in the movies,''
\newblock {\em Accounting, Organizations and Society}, volume 19,  number 3, pp. 303--318.

\bibitem[Belton, 1996]{belton1996movies}
John Belton, 1996.
\newblock {\em Movies and mass culture}.
\newblock Rutgers University Press.

\bibitem[Bishop, 2016]{christopher2016pattern}
Chris Bishop, 2016.
\newblock {\em Pattern Recognition and Machine Learning}.
\newblock Springer-Verlag New York.

\bibitem[Bordwell et~al., 2003]{bordwell2003excessively}
David Bordwell, Janet Staiger, and Kristin Thompson, 2003.
\newblock An excessively obvious cinema.
\newblock In {\em The Classical Hollywood Cinema}, pages 21--29. Routledge.

\bibitem[Brutzer et~al., 2011]{brutzer2011evaluation}
Sebastian Brutzer, Benjamin H{\"o}ferlin, and Gunther Heidemann, 2011.
\newblock ``Evaluation of background subtraction techniques for video
  surveillance,''
\newblock In {\em IEEE Conference on Computer Vision and Pattern Recognition},
  pp. 1937--1944.

\bibitem[Burcoff and Shamir, 2017]{burcoff2017computer}
Amanda Burcoff, and Lior Shamir, 2017.
\newblock ``Computer analysis of pablo picasso's artistic style,''
\newblock {\em International Journal of Art, Culture and Design Technologies},
volume 6, number 1, pp. 1--18.

\bibitem[Corrigan, 1991]{corrigan1991cinema}
Timothy Corrigan, 1991.
\newblock {\em A cinema without walls: movies and culture after Vietnam}.
\newblock Rutgers University Press.

\bibitem[Debnath et~al., 2008]{debnath2008feature}
Souvik Debnath, Niloy Ganguly, and Pabitra Mitra, 2008.
\newblock ``Feature weighting in content based recommendation system using social
  network analysis,''
\newblock In {\em Proceedings of the 17th International Conference on World
  Wide Web}, pp. 1041--1042.

\bibitem[Fatemi and Tokarchuk, 2012]{fatemi2012empirical}
Maryam Fatemi, and Laurissa Tokarchuk, 2012.
\newblock ``An empirical study on imdb and its communities based on the network
  of co-reviewers,''
\newblock In {\em Proceedings of the First Workshop on Measurement, Privacy,
  and Mobility}, pp.~7.

\bibitem[Gallos et~al., 2013]{gallos2013imdb}
Lazaros Gallos, Fabricio Potiguar, Jos{\'e} S. Andrade~Jr, and Herman A. Makse, 2013.
\newblock ``IMDB network revisited: unveiling fractal and modular properties from
  a typical small-world network,''
\newblock {\em PLoS ONE}, volume 8, number 6, pp. e66443.

\bibitem[Giroux, 2001]{giroux2001breaking}
Henry A. Giroux, 2001.
\newblock {\em Breaking in to the Movies: Film and the Culture of Politics}.
\newblock Wiley-Blackwell.

\bibitem[Hogan, 2009]{hogan2009understanding}
Patrick C. Hogan, 2009.
\newblock {\em Understanding Indian Movies: Culture, Cognition, and Cinematic
  Imagination}.
\newblock University of Texas Press.

\bibitem[Hsieh et~al., 2006]{hsieh2006automatic}
Jun-Wei Hsieh, Shih-Hao Yu, Yung-Sheng Chen, and Wen-Fong Hu, 2006.
\newblock ``Automatic traffic surveillance system for vehicle tracking and
  classification,''
\newblock {\em IEEE Transactions on Intelligent Transportation Systems},
  volume 7, number 2, pp. 175--187.

\bibitem[Jung, 2012]{jung2012attribute}
Jason J. Jung, 2012.
\newblock ``Attribute selection-based recommendation framework for short-head
  user group: An empirical study by movielens and IMDB,''
\newblock {\em Expert Systems with Applications}, volume 39, number 4, pp. 4049--4054.

\bibitem[Kuehne et~al., 2011]{kuehne2011hmdb}
Hildegard Kuehne, Hueihan Jhuang, Est{\'\i}baliz Garrote, Tomaso Poggio, and Thomas Serre, 2011.
\newblock ``HMDB: a large video database for human motion recognition,''
\newblock In {\em IEEE International Conference on Computer Vision}, pp. 2556--2563. 

\bibitem[Kuminski et~al., 2014]{kum14}
Evan Kuminski, Joe George, John Wallin, and Lior Shamir, 2014.
\newblock ``Combining human and machine learning for morphological analysis of
  galaxy images,''
\newblock {\em Publications of the Astronomical Society of the Pacific},
 volume 126, number 944, pp. 959--967.

\bibitem[Lamprecht et~al., 2015]{lamprecht2015improving}
Daniel Lamprecht, Florian Geigl, Tomas Karas, Simon Walk, Denis Helic, and Markus Strohmaier, 2015.
\newblock ``Improving recommender system navigability through diversification: A
  case study of IMDB,''
\newblock In {\em Proceedings of the 15th International Conference on Knowledge
  Technologies and Data-driven Business}, pp.~21.

\bibitem[Maltby, 2003]{maltby2003hollywood}
Richard G. Maltby, 2003.
\newblock {\em Hollywood Cinema}.
\newblock Blackwell Publishing.

\bibitem[Melville et~al., 2002]{melville2002content}
Prem Melville, Raymond J. Mooney, and Ramadass Nagarajan, 2002.
\newblock ``Content-boosted collaborative filtering for improved recommendations,''
\newblock In {\em Proceedings of the Eighteenth National Conference on
  Artificial Intelligence}.

\bibitem[Metz, 1974]{metz1974film}
Christian Metz, 1974.
\newblock {\em Film language: A semiotics of the cinema}.
\newblock University of Chicago Press.

\bibitem[Monaco and Lindroth, 2000]{monaco2000read}
James Monaco, and David Lindroth, 2000.
\newblock {\em How to read a film: the world of movies, media, and multimedia:
  language, history, theory}.
\newblock Oxford University Press, USA.

\bibitem[Mulvey, 1989]{mulvey1989visual}
Laura Mulvey, 1989.
\newblock ``Visual pleasure and narrative cinema,''
\newblock In {\em Visual and other pleasures}, pp. 14--26. Springer.

\bibitem[Newman, 2008]{Newman2008}
Michael Newman, 2008.
\newblock ``Ze Frank and the poetics of Web video,''
\newblock {\em First Monday}, volume 13, number 5.

\bibitem[Oghina et~al., 2012]{oghina2012predicting}
Andrei Oghina, Mathias Breuss, Manos Tsagkias, and Maarten de~Rijke, 2012.
\newblock Predicting IMDB movie ratings using social media.
\newblock In {\em European Conference on Information Retrieval}, pp. 503--507. Springer.

\bibitem[Orlov et~al., 2008]{orlov2008wnd}
Nikita Orlov, Lior Shamir, Tomasz Macura, Josiah Johnston, David M. Eckley, and Ilya Goldberg, 2008.
\newblock ``Wnd-charm: Multi-purpose image classification using compound image
  transforms,''
\newblock {\em Pattern Recognition Letters}, volume 29, number 11, pp. 1684--1693.

\bibitem[Felsenstein, 1989]{plotree1989phylip}
Joseph Felsenstein, 1989.
\newblock Phylip-phylogeny inference package (version 3.2).
\newblock {\em Cladistics}, 5(163):6.

\bibitem[Shamir, 2012a]{shamir2012automatic}
Lior Shamir, 2012a.
\newblock ``Automatic detection of peculiar galaxies in large datasets of galaxy
  images,''
\newblock {\em Journal of Computational Science}, volume 3, number 3, pp. 181--189.

\bibitem[Shamir, 2012b]{shamir2012computer}
Lior Shamir, 2012b.
\newblock ``Computer analysis reveals similarities between the artistic styles of
  van gogh and pollock,''
\newblock {\em Leonardo}, volume 45, number 2, pp. 149--154.

\bibitem[Shamir, 2015]{shamir2015makes}
Lior Shamir, 2015.
\newblock ``What makes a pollock pollock: a machine vision approach,''
\newblock {\em International Journal of Arts and Technology}, volume 8, number 1, pp. 1--10.

\bibitem[Shamir, 2017]{udatascl}
Lior Shamir, 2017.
\newblock ``UDAT: A multi-purpose data analysis tool,''
\newblock Astrophysics Source Code Library. pp. ascl:1704.002.

\bibitem[Shamir et~al., 2013]{shamir2013automatic}
Lior Shamir, Anthony Holincheck, and John Wallin, 2013.
\newblock ``Automatic quantitative morphological analysis of interacting
  galaxies,''
\newblock {\em Astronomy and Computing}, volume 2, pp. 67--73.

\bibitem[Shamir et~al., 2010]{shamir2010impressionism}
Lior Shamir, Tomasz Macura, Nikita Orlov, David M. Eckley, and Ilya G. Goldberg, 2010.
\newblock ``Impressionism, expressionism, surrealism: Automated recognition of
  painters and schools of art,''
\newblock {\em ACM Transactions on Applied Perception}, volume 7, number 2, pp. 8.

\bibitem[Shamir et~al., 2016]{shamir2016distinguishing}
Lior Shamir, Jenny Nissel, and Ellen Winner, 2016.
\newblock ``Distinguishing between abstract art by artists vs. children and
  animals: Comparison between human and machine perception,''
\newblock {\em ACM Transactions on Applied Perception}, volume 13, number 3, pp. 17.

\bibitem[Shamir et~al., 2008]{Sha08}
Lior Shamir, Nikita Orlov, David M. Eckley, Tomasz Macura, Josiah Johnston, and Ilya G. Goldberg, 2008.
\newblock ``Wndchrm -- an open source utility for biological image analysis,''
\newblock {\em Source Code for Biology and Medicine}, volume 3, pp. 13.

\bibitem[Shamir et~al., 2009]{shamir2009evaluation}
Lior Shamir, Nikita Orlov, and Ilya G. Goldberg, 2009.
\newblock ``Evaluation of the informativeness of multi-order image transforms,''
\newblock In {\em IPCV}, pp. 37--42.

\bibitem[Shamir and Tarakhovsky, 2012]{shamir2012art}
Lior Shamir and Jane A. Tarakhovsky, 2012.
\newblock ``Computer analysis of art,''
\newblock {\em Journal on Computing and Cultural Heritage}, volume 5, number 2, pp. 7.

\bibitem[Soomro et~al., 2012]{soomro2012ucf101}
Khurram Soomro, Amir Roshan Zamir, and Mubarak Shah, 2012.
\newblock ``UCF101: A dataset of 101 human actions classes from videos in the wild,''
\newblock {\em arXiv preprint arXiv:1212.0402}.

\bibitem[Weng et~al., 2009]{weng2009rolenet}
Chung-Yi Weng, Wei-Ta Chu, and Ja-Ling Wu, 2009.
\newblock ``Rolenet: Movie analysis from the perspective of social networks,''
\newblock {\em IEEE Transactions on Multimedia}, volume 11, number 2, pp. 256--271.

%
\end{thebibliography}

\end{document}



