# Words in a Play


## installing dependencies

Installing [spaCy](https://spacy.io/) (a Python library for natural language processing) and its dependencies is a bit arduous but should work as follows:

In [None]:
!pip install spacy pydracor
#  !pip install spacy-transformers # first check if really required → has many dependencies

Afterwards, download a language model (see [hints on selection of models](https://spacy.io/models)):

In [None]:
!python -m spacy download en_core_web_sm

## load play from DraCor

In [None]:
import pydracor

# Look the play up by its DraCor name.
play = pydracor.Play(play_name="a-midsummer-night-s-dream")

# Trailing expression: the notebook shows the play's spoken text as cell output.
play.spoken_text()

## tokenise text and detect parts of speech

In [None]:
import spacy

# Load the small English pipeline exactly once. ``spacy.load`` with the package
# name and ``import en_core_web_sm; en_core_web_sm.load()`` are two ways of
# loading the same installed pipeline, so running both only repeated the work
# and threw the first instance away.
nlp = spacy.load("en_core_web_sm")

# Run the pipeline over the play's spoken text; the resulting ``doc`` provides
# the tokens (with ``text``, ``pos_`` and ``lemma_``) used by the cells below.
doc = nlp(play.spoken_text())

## text and parts of speech

In [None]:
# One (token text, part-of-speech tag, lemma) triple per token of the document.
print([(token.text, token.pos_, token.lemma_) for token in doc])

## most frequent parts of speech

In [None]:
from collections import Counter

def count_words(doc, word_type):
    """Tally the lemmas of all tokens in ``doc`` that are tagged ``word_type``.

    Args:
        doc: Iterable of tokens exposing ``pos_`` and ``lemma_`` attributes.
        word_type: Part-of-speech tag to keep; compared to ``pos_`` with ``==``.

    Returns:
        A ``Counter`` mapping each lemma to its number of occurrences.
    """
    # Lemmas are counted instead of the raw token text (``token.text``), which
    # groups the different surface forms of a word under one entry.
    return Counter(token.lemma_ for token in doc if token.pos_ == word_type)

def print_top(words, n):
    """Print the ``n`` most common entries of ``words``, one per line.

    Args:
        words: A ``Counter`` (anything providing ``most_common``).
        n: Maximum number of entries to print.

    Each line holds the count first, then the counted item, separated by a tab.
    """
    ranking = words.most_common(n)
    for item, frequency in ranking:
        print(frequency, item, sep='\t')

# Distribution of part-of-speech tags over every token of the parsed play.
word_types = Counter(token.pos_ for token in doc)

# Show the ten most frequent tags.
print_top(word_types, 10)

## most frequent words per part of speech

In [None]:
# For each part of speech of interest, list its ten most frequent lemmas.
for word_type in ("NOUN", "VERB", "ADJ"):
    print("---", word_type, "---")
    top_lemmas = count_words(doc, word_type)
    print_top(top_lemmas, 10)