This Markdown file is the first part of this analysis.

I use a unique dataset that contains information on 47,006 Airbnb listings from seven major German cities, namely Berlin, Munich, Hamburg, Cologne, Dresden, Stuttgart and Frankfurt am Main. Listings were gathered directly from Airbnb’s website in September 2017 using a custom web scraper. The dataset includes all publicly available information for a listing, including but not limited to prices, accommodation features, reviews and host details.

Data Preparations

# Report how many listings (rows) the raw data set contains.
print(paste0("Number of rows: ", nrow(rooms)))

## [1] "Number of rows: 47006"

# Report how many variables (columns) the raw data set contains.
print(paste0("Number of columns: ", ncol(rooms)))

## [1] "Number of columns: 62"

# Show the type and the first few values of every column in `rooms`.
str(rooms)

## Classes 'tbl_df', 'tbl' and 'data.frame':    47006 obs. of  62 variables:
##  $ room_id                   : int  19117409 5728058 19954984 9918551 13836114 20355318 18732461 12021779 18019626 20121368 ...
##  $ host_id                   : int  133588182 333588 140968262 50992051 81617924 80225160 49157795 7901771 2307050 20759906 ...
##  $ room_type                 : chr  "Entire home/apt" "Entire home/apt" "Entire home/apt" "Entire home/apt" ...
##  $ country                   : chr  "Deutschland" "Deutschland" "Deutschland" "Deutschland" ...
##  $ city                      : chr  "Hamburg" "Hamburg" "München" "Schönefeld" ...
##  $ neighborhood              : chr  NA NA NA NA ...
##  $ address                   : chr  "Othmarschen, Hamburg" "Neustadt, Hamburg" "Schwabing - West, München" "Schönefeld" ...
##  $ price                     : int  129 116 91 43 61 49 120 120 145 91 ...
##  $ nightly_price             : int  129 116 91 43 61 49 120 120 145 91 ...
##  $ reviews                   : int  3 24 10 0 13 1 10 11 4 1 ...
##  $ accommodates              : int  2 2 6 1 2 2 5 6 5 4 ...
##  $ bathrooms                 : int  1 1 1 1 1 1 1 1 2 1 ...
##  $ bedrooms                  : int  1 1 2 0 1 1 3 2 3 1 ...
##  $ bed_type                  : chr  "Real Bed" "Real Bed" "Real Bed" "Real Bed" ...
##  $ minstay                   : int  2 3 2 3 1 2 2 2 6 2 ...
##  $ last_modified             : POSIXct, format: "2017-09-27 08:47:10" "2017-09-27 08:47:27" ...
##  $ latitude                  : num  53.6 53.6 48.2 52.4 53.6 ...
##  $ longitude                 : num  9.9 9.98 11.56 13.44 9.98 ...
##  $ survey_id                 : int  7 7 2 1 7 3 7 2 1 2 ...
##  $ location                  : chr  NA NA NA NA ...
##  $ coworker_hosted           : chr  NA NA NA NA ...
##  $ extra_host_languages      : chr  "{en}" "{en}" "{en}" "{en,fr}" ...
##  $ name                      : chr  "Komfortable Erdgeschosswohnung mit Südterrasse." "Cozy city apartment - very central" "EmiLi - Helle, gemütliche Wohnung in bester Lage" "Einliegerwohnung auf dem Mauerweg" ...
##  $ property_type             : chr  "Wohnung" "Wohnung" "Wohnung" "Bed & Breakfast" ...
##  $ currency                  : chr  "EUR" "EUR" "EUR" "EUR" ...
##  $ rate_type                 : chr  "nightly" "nightly" "nightly" "nightly" ...
##  $ overall_satisfaction      : chr  "100" "96" "100" NA ...
##  $ cleanliness_satisfaction  : int  10 10 10 NA 10 10 10 9 10 8 ...
##  $ communication_satisfaction: int  10 10 10 NA 10 10 10 9 10 6 ...
##  $ location_satisfaction     : int  10 10 10 NA 10 10 10 9 10 8 ...
##  $ accuracy_satisfaction     : int  9 10 10 NA 10 10 10 9 10 10 ...
##  $ checkin_satisfaction      : int  10 10 10 NA 10 10 10 10 10 6 ...
##  $ value_satisfaction        : chr  "10" "10" "10" NA ...
##  $ amenities                 : chr  "{128,1,129,4,8,9,21,91,92,93,30,94,31,95,96,33,98,35,99,100,101,40,41,44,110,111,112,113,50,115,116,120,57,121,61,127}" "{1,49,50,35,8,40,28,44,45,30,46}" "{33,129,35,4,38,8,40,73,42,44,45,46,47,28,61,30}" "{33,34,35,4,37,38,39,8,40,9,41,44,45,46,47,16,49,28,30,31}" ...
##  $ cancel_policy             : chr  "4" "5" "3" "3" ...
##  $ instant_book              : chr  "false" "false" "true" "false" ...
##  $ response_time             : chr  "51118" "1000" "1" "28566" ...
##  $ response_rate             : num  1 1 1 1 1 0.5 1 1 1 1 ...
##  $ friend_count              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ wishlist_count            : int  14 90 27 17 26 0 43 73 20 4 ...
##  $ pic_count                 : chr  "12" "4" "7" "5" ...
##  $ superhost                 : chr  "false" "false" "false" "false" ...
##  $ description_language      : chr  "de" "de" "de" "de" ...
##  $ hostname                  : chr  "Michael" "Nana" "Lina &amp; Emily" "Liliana" ...
##  $ rule_children             : chr  "true" "false" "true" "true" ...
##  $ rule_infants              : chr  "false" "false" "false" "true" ...
##  $ rule_pets                 : chr  "false" "false" "false" "false" ...
##  $ rule_smoking              : chr  "false" "false" "false" "false" ...
##  $ rule_events               : chr  "false" "false" "false" "false" ...
##  $ hostprofilepic            : chr  "https://a0.muscache.com/im/pictures/7e75a61b-5240-4867-b496-f7efdb564053.jpg?aki_policy=profile_x_medium" "https://a0.muscache.com/im/users/333588/profile_pic/1406487683/original.jpg?aki_policy=profile_x_medium" "https://a0.muscache.com/im/pictures/02b39cd9-1fd4-498e-b830-203f11919ee2.jpg?aki_policy=profile_x_medium" "https://a0.muscache.com/im/pictures/46deaa24-5700-45ed-b0ee-7a81b552da7f.jpg?aki_policy=profile_x_medium" ...
##  $ cleaning_fee              : chr  "20" NA NA NA ...
##  $ security_deposit          : chr  NA NA NA NA ...
##  $ last_review               : POSIXct, format: "2017-09-09 13:37:58" "2017-06-18 11:33:06" ...
##  $ positive_reviews          : POSIXct, format: NA NA ...
##  $ negative_reviews          : Date, format: NA NA ...
##  $ last_cal_update           : chr  "2017-06-22" "2017-09-18" "2017-09-04" "2017-09-20" ...
##  $ member_since              : chr  "Juni 2017" "Januar 2011" "Juli 2017" "Dezember 2015" ...
##  $ host_verified             : chr  "TRUE" "TRUE" "FALSE" "FALSE" ...
##  $ deleted                   : chr  "0" "0" "0" "0" ...
##  $ filled                    : chr  "TRUE" "TRUE" "TRUE" "TRUE" ...
##  $ description               : chr  "Die 80 qm große Wohnung ist im Erdgeschoß gelegen und sehr gut ausgestattet. Es gibt eine moderne Küche mit Ess"| __truncated__ "Bright, quiet, fully furnished, in the middle of Hamburg – great central suburb „Neustadt“. Fully equipped + li"| __truncated__ "Super schöne, sehr helle Wohnung. Stilvoll und mit viel Liebe eingerichtet. In top Lage!  Karstadt, Rewe, Lidl,"| __truncated__ "- sehr ruhige Lage im Süden Berlins; 150 m zum Bus - besteht aus einem Zimmer (23,10 qm) mit integrierter Küche"| __truncated__ ...
##  $ base_price                : chr  NA NA NA NA ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 62
##   .. ..$ room_id                   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ host_id                   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ room_type                 : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ country                   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ city                      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ neighborhood              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ address                   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ price                     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ nightly_price             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ reviews                   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ accommodates              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ bathrooms                 : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ bedrooms                  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ bed_type                  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ minstay                   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ last_modified             :List of 1
##   .. .. ..$ format: chr ""
##   .. .. ..- attr(*, "class")= chr  "collector_datetime" "collector"
##   .. ..$ latitude                  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ longitude                 : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ survey_id                 : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ location                  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ coworker_hosted           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ extra_host_languages      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ name                      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ property_type             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ currency                  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ rate_type                 : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ overall_satisfaction      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ cleanliness_satisfaction  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ communication_satisfaction: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ location_satisfaction     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ accuracy_satisfaction     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ checkin_satisfaction      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ value_satisfaction        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ amenities                 : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ cancel_policy             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ instant_book              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ response_time             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ response_rate             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ friend_count              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ wishlist_count            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ pic_count                 : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ superhost                 : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ description_language      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ hostname                  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ rule_children             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ rule_infants              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ rule_pets                 : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ rule_smoking              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ rule_events               : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ hostprofilepic            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ cleaning_fee              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ security_deposit          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ last_review               :List of 1
##   .. .. ..$ format: chr ""
##   .. .. ..- attr(*, "class")= chr  "collector_datetime" "collector"
##   .. ..$ positive_reviews          :List of 1
##   .. .. ..$ format: chr ""
##   .. .. ..- attr(*, "class")= chr  "collector_datetime" "collector"
##   .. ..$ negative_reviews          :List of 1
##   .. .. ..$ format: chr ""
##   .. .. ..- attr(*, "class")= chr  "collector_date" "collector"
##   .. ..$ last_cal_update           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ member_since              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ host_verified             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ deleted                   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ filled                    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ description               : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ base_price                : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

# Convert the string-typed rating and picture-count columns to numeric, then
# drop every listing that has no overall rating.
rooms$overall_satisfaction <- as.numeric(rooms$overall_satisfaction)
rooms$pic_count <- as.numeric(rooms$pic_count)
rooms <- rooms %>%
  filter(!is.na(overall_satisfaction))

(1) Cities

Keep listings from the following cities: Hamburg, München, Berlin, Köln, Frankfurt am Main (FFM), Dresden, Stuttgart

## create clean-up function
# Collapse free-text city labels onto a canonical city name.
#
# x:    a single canonical city name, matched as a literal substring
# city: character vector of raw city labels
#
# Returns `city` with every element that contains `x` replaced by `x`;
# all other elements, including NA, are returned unchanged.
create_city <- function(x, city){
  # fixed = TRUE: `x` is a plain name, not a regular expression, so
  # characters such as "." or "(" must not act as metacharacters.
  hits <- grepl(x, city, fixed = TRUE)
  # Assign in place rather than via ifelse() so that the type of `city`
  # is preserved even for zero-length input.
  city[hits] <- x
  city
}

# Canonical names of the seven cities covered by the analysis.
city_list <- c("Hamburg","München","Berlin","Frankfurt","Köln","Stuttgart","Dresden")

# Fold create_city() over the list: after each step, every label that
# contains the current city name has been replaced by that name.
rooms$city <- Reduce(
  function(labels, city_name) create_city(city_name, labels),
  city_list,
  rooms$city
)

# Keep only listings whose cleaned city label is one of the seven cities.
rooms <- rooms %>%
  filter(city %in% city_list)

# Bar chart of the number of listings per city, largest city first.
rooms %>%
  count(city) %>%
  ggplot(aes(x = reorder(city, -n), y = n)) +
  geom_col(fill = col[3], alpha = 0.8) +
  labs(x = "", y = "", title = "Count")

(2) Property Type

# Horizontal bar chart of listing counts per property type, ordered by count.
rooms %>%
  count(property_type) %>%
  ggplot(aes(x = reorder(property_type, n), y = n)) +
  geom_col(fill = col[3], alpha = 0.8) +
  labs(x = "", y = "", title = "Property Types") +
  coord_flip()

To keep things simple, I will just keep listings of property type “Wohnung” (apartment)

# Restrict the data to listings whose property type is "Wohnung".
rooms <- rooms %>%
  filter(property_type == "Wohnung")

(3) Roomtype

# Bar chart of the number of listings per room type.
ggplot(rooms, aes(x = room_type)) +
  geom_bar(fill = col[3], alpha = 0.8) +
  labs(x = "", y = "")

(4) Price

# Box plot of listing prices per city.
ggplot(rooms, aes(x = city, y = price)) +
  geom_boxplot(outlier.size = 0)

Apparently, there are some outliers. After checking the respective listings, I decided to exclude them.

# Drop the extreme price outliers (prices of 1500 and above).
rooms <- rooms %>%
  filter(price < 1500)

# Bin prices into unit-wide intervals up to 500; every price above 500 falls
# into one final open-ended bin.
rooms$price.cut <- cut(rooms$price, c(seq(0,500,1), Inf))

# Ridge plot of the binned price distribution per city.
rooms %>%
  ggplot(aes(as.numeric(price.cut), factor(city))) +
  geom_density_ridges(scale = 5,
                      fill = col[3], alpha = 0.7,
                      color = "white") +
  theme_ridges() +
  # Pin the breaks explicitly: `labels` is paired with the breaks by
  # position, so relying on automatically chosen breaks can shift or
  # mismatch the labels.
  scale_x_continuous(expand = c(0, 0),
                     breaks = seq(0, 500, 100),
                     labels = c(seq(0, 400, 100), ">500")) +
  labs(y = "", x = "Price")

(5) Rating

# Ridge plot of the overall rating distribution per room type.
ggplot(rooms, aes(x = overall_satisfaction, y = factor(room_type))) +
  geom_density_ridges(scale = 5, fill = col[3], alpha = 0.7, color = "white") +
  scale_x_continuous(expand = c(0, 0)) +
  labs(y = "", x = "Rating")

(6) Number of Reviews

Next, I exclude listings with fewer than three reviews, as it can be assumed that these listings have never been booked, or only very rarely.

# Keep only listings that have at least three reviews.
rooms <- rooms %>%
  filter(reviews >= 3)

# Unit-wide bins up to 50 reviews plus one open-ended bin for anything above.
rooms$reviews.cut <- cut(rooms$reviews, c(seq(0,50,1), Inf))

# Ridge plot of the binned review counts per city.
ggplot(rooms, aes(x = as.numeric(reviews.cut), y = factor(city))) +
  geom_density_ridges(scale = 5, fill = col[3], alpha = 0.7, color = "white") +
  scale_y_discrete(expand = c(0, 0)) +
  scale_x_continuous(expand = c(0, 0),
                     breaks = seq(0, 50, 10),
                     labels = c(seq(0, 40, 10), ">50")) +
  labs(y = "", x = "Number of Reviews")

Final dataframe

# Build the modelling data frame: join title and description into one text
# field, then keep only the columns used in the rest of the analysis.
df <- rooms %>%
  mutate(fulltext = paste(name, description)) %>%
  select(room_id, name, description, city, price, overall_satisfaction,
         room_type, bed_type, pic_count, reviews, accommodates, bedrooms,
         minstay, latitude, longitude, fulltext)

Textdata

Turning to the text data, let's first have a quick look at three random descriptions:

# Print three randomly drawn listing descriptions as a left-aligned table.
rooms %>%
  select(description) %>%
  sample_n(3) %>%
  knitr::kable(align = "l")

description
Meine Unterkunft ist in der Nähe von Neuperlach süd S bahn und u bahn Bus direkt vor der haustür Einkaufsmöglichkeiten 5 min fussweg Park vor der haustür Naherholungsgebiet in neubiberg. Du wirst meine Unterkunft lieben wegen Weil sie sehr gemütlich ist. Der Ausblick, die Lage, die Leute und die Umgebung sind toll. Meine Unterkunft ist gut für paare, alleinreisende abenteurer und geschäftsreisende.
Sunny balcony. Quiet. 4th floor. Completely renovated and still feels brand new! This is a vibrant, clean and safe area of Berlin. Brilliant connections to transport (S + U + Tram at Schönhauser Allee) to get easily anywhere in Berlin, including the airports. English/ Francais/ Español/ Deutsch
Schöne 1,5 Zimmer Wohnung in München Trudering. Einkaufsmöglichkeiten sind fussläufig erreichbar. Eine Bushaltestelle ist 3 min entfernt, dort fährt die 146 und 192. U-und S-Bahnhof Richtung Innenstadt, ebenfalls fussläufig erreichbar. Die Wohnung enthält 1 Wohnzimmer und ein abgetrennten Schlafbereich, 1 Badezimmer mit Badewanne und 1 Küche mit Spühlmaschine.

Languages

In which languages are the descriptions written?

# Restore the objects stored in prep1.Rda into the current environment.
# NOTE(review): the code below reads df$language, which none of the visible
# steps create, so this file presumably supplies a `df` that already carries
# a language column and replaces the one built above -- confirm.
load(file = "../output/prep1.Rda")

# Horizontal bar chart of the number of listings per detected language.
df %>%
  ungroup() %>%
  count(language) %>%
  ggplot(aes(x = reorder(language, n), y = n)) +
  geom_col(fill = col[3], alpha = 0.7) +
  coord_flip() +
  labs(x = "", y = "")

Check a sample of listings to see whether the language classification is valid

# Print five random listings with their detected language for a manual check.
df %>%
  select(fulltext, language) %>%
  sample_n(5) %>%
  knitr::kable()

fulltext	language
Nettes Apartment - zentral gelegen Die Wohnung befindet sich im schönen Stadtteil Au-Haidhausen. Mit den öffentlichen Verkehrsmitteln kann man binnen weniger Minuten den Hauptbahnhof oder den Marienplatz erreichen. Außerdem ist die Isar nur wenige Gehminuten entfernt. The appartment is located in the middle of Munic. It is a 5 minute walk to the subway and bus. From there you reach the central trainstation in 5 minutes. It is about 20 minutes to the OKTOBERFEST.	german
helle, moderne Wohnung im grünen Winterhude Privatwohnung in Winterhude. Die U Bahn und Bushaltestelle Borgweg ist in wenigen Gehminuten zu erreichen. Der Stadtpark ist ebenfalls fußläufig zu erreichen. Zu der Wohnung: Die Wohnung ist ca. 54 qm groß und gerade neu eingerichtet worden. Die großen Fenster in allen Räumen lassen die Wohnung sehr hell und freundlich wirken. Die Küche ist voll ausgestattet. Das Badezimmer beinhaltet eine Waschmaschine, die ebenfalls benutzt werden kann.	german
Beautiful quiet apartment in Friedrichshain I`m renting my beautiful furnished 90 sqm 3 room apartment in Berlin Friedrichshain between Ostkreuz and Elsenbrücke. Huge loft style living room with open fully equipped kitchen with induction cooktop, bathroom with tub, rain shower and washing machine and nice balcony in the backhouse on the 2nd floor. Perfect for 4 people.	english
Kreuzberg- in the heart of Berlin4 Die Wohnung liegt in der 3. Etage ohne Fahrstuhl Die Wohnung ist mit allem ausgestattet, um nach erlebnisreichen Tagen und Nächten neue Energie zu tanken. In der Küche finden Sie Geschirr, Töpfen , sowie Kaffeemaschine, Toaster, Wasserkocher, Kühlschrank und Induktionsherd. Das Bad ist mit Waschtisch, Badewanne,Dusche und WC ausgestattet.	german
1-Zimmer Wohnung in Maxvorstadt Ich vermiete mein schönes Zimmer in Bestlage Maxvorstadt. Meine Tiefgarage direkt unter der Wohnung kann auch genutzt werden, Auto darf aber nicht zu groß sein Bestlage Maxvorstadt. Der Elisabethmarkt, ein REWE, Cafes, ein Getränkemarkt, ein Späti(!!).. sind direkt um die Ecke. Die Universitäten LMU und TU sowie die meisten Restaurants, Cafes etc. die Maxvorstadt so zu bieten hat sind fußläufig in unter 10 Minuten erreichbar. Tram und Ubahn in 5 Minuten	german

OK, looks good. Let's only keep listings with German and English descriptions.

# Keep only listings whose description was classified as German or English.
df <- df %>%
  filter(language %in% c("german","english"))

# Stacked bar chart of listings per city, split by description language.
df %>%
  ggplot(aes(x = factor(city), fill = language)) +
  geom_bar(alpha = 0.8) +
  labs(x = "", y = "", fill = "")

It is not surprising that Berlin seems to be the most international city, measured by the listings that have their description in English. But I am a little disappointed with Hamburg…

Word count

How long are the descriptions on average?

# Count the words (runs of non-whitespace characters) in each text.
# gregexpr() signals "no match" with a single -1, so taking length() of its
# result reports one word for a text without any words; counting the
# extracted matches instead gives 0 in that case.
word_hits <- gregexpr("\\S+", df$fulltext)
df$text_length <- lengths(regmatches(df$fulltext, word_hits))

# Unit-wide bins up to 150 words plus one open-ended bin for longer texts.
df$text_length.cut <- cut(df$text_length, c(seq(0,150,1),Inf))

# Ridge plot of the binned word counts per city, split by language.
df %>%
  ggplot(aes(as.numeric(text_length.cut), factor(city))) +
  geom_density_ridges(aes(fill = language),
                      color = "white", alpha = 0.8) +
  # Pin the breaks explicitly: `labels` is paired with the breaks by
  # position, so it must not depend on automatically chosen breaks.
  scale_x_continuous(expand = c(0, 0),
                     breaks = seq(0, 150, 50),
                     labels = c(seq(0, 100, 50), ">150")) +
  labs(y = "", x = "Word Count", fill = "")

Surprisingly, the English texts are longer.

Pre-Processing

Next, I have to pre-process the text data to be able to include it into my model. Text data is inherently high-dimensional, so to reduce this dimensionality the following steps will be applied:

Remove Punctuation, Numbers,…
Stopword removal: Stopwords (highly frequent terms like “and”, “or”, “the”) are stripped out of the text as they do not add any helpful information about the listing.
Tokenization: splitting of a raw character string into individual elements of interest: words, numbers, punctuation.
Document Term Matrix: Represent each listing as a numerical array of unique terms (bag-of-words model). This will be done in part three of this project.

(1) Remove Punctuation, Numbers, …

# Replace punctuation, control characters and digits by a space so that the
# words on either side of them stay separated.
df$text_cleaned <- gsub("[[:punct:][:cntrl:][:digit:]]", " ", df$fulltext)
# Collapse the resulting runs of whitespace and strip the ends; the previous
# version replaced leading/trailing whitespace by a space instead of
# removing it and left internal runs untouched.
df$text_cleaned <- gsub("[[:space:]]+", " ", df$text_cleaned)
df$text_cleaned <- trimws(df$text_cleaned)
df$text_cleaned <- tolower(df$text_cleaned)

(2) Remove Stopwords

# Strip English stopwords first, then German ones, from the cleaned text.
df$text_cleaned <- removeWords(
  removeWords(df$text_cleaned, stopwords("english")),
  stopwords("german")
)

(3) Tokenizing

Unigrams

# One row per word of the cleaned text; keep words of 2 to 29 characters.
token.df <- df %>%
  tidytext::unnest_tokens(word, text_cleaned) %>%
  filter(nchar(word) > 1, nchar(word) < 30)

# Table of the 20 most frequent words.
token.df %>%
  count(word) %>%
  ungroup() %>%
  arrange(desc(n)) %>%
  top_n(20, n) %>%
  knitr::kable(align = "l")

word	n
wohnung	12264
apartment	9732
zimmer	8800
room	8529
min	8365
berlin	5994
bahn	5187
restaurants	4511
minuten	4289
flat	4200
küche	3877
city	3862
nähe	3800
unterkunft	3488
bars	3228
qm	3060
direkt	2992
liegt	2983
station	2955
lage	2916

Bigrams

# One row per pair of consecutive words in the cleaned text. Stopwords were
# already removed at this point, so a pair is not necessarily adjacent in the
# original description.
# The call is namespace-qualified, as in the unigram step, so that it does
# not depend on tidytext being attached.
bigram.df <- df %>%
  tidytext::unnest_tokens(bigram, text_cleaned,
                          token = "ngrams", n = 2)

# Table of the 20 most frequent bigrams.
bigram.df %>%
  count(bigram, sort = TRUE) %>%
  ungroup() %>%
  top_n(20, n) %>%
  knitr::kable(align = "l")

bigram	n
u bahn	2699
s bahn	1870
zimmer wohnung	1497
wohnung liegt	1287
prenzlauer berg	1083
living room	1081
city center	989
walking distance	982
unterkunft gut	936
bars restaurants	891
paare alleinreisende	848
gut paare	832
unterkunft nähe	811
restaurants bars	786
alleinreisende abenteurer	771
wohnung befindet	751
unmittelbarer nähe	745
unterkunft lieben	733
st pauli	689
lieben wegen	678

Wordclouds

# Build a corpus with one document per listing from the cleaned text.
corp <- corpus(df$text_cleaned)
# Attach each listing's city as the document variable.
# NOTE(review): the subsets below filter on `docvar1`, so they rely on this
# unnamed assignment producing a variable with that name -- confirm for the
# installed quanteda version.
docvars(corp)<-df$city   # attach the city labels to the corpus documents

# 10-colour "BrBG" palette from RColorBrewer, used to colour the word clouds.
# NOTE(review): `col[3]` is already used by plots earlier in this document, so
# `col` is presumably also defined before this point -- confirm.
col <- RColorBrewer::brewer.pal(10, "BrBG")

(1) Berlin

# Document-feature matrix of the Berlin listings (lower-cased, numbers and
# SMART stopwords removed), drawn as a word cloud of terms occurring at
# least 250 times.
c.plot <- dfm(
  corpus_subset(corp, docvar1 == "Berlin"),
  tolower = TRUE,
  remove_numbers = TRUE,
  remove = stopwords("SMART")
)

textplot_wordcloud(c.plot, min.freq = 250, color = col)

(2) Hamburg

# Document-feature matrix of the Hamburg listings (lower-cased, numbers and
# SMART stopwords removed), drawn as a word cloud of terms occurring at
# least 200 times.
c.plot <- dfm(
  corpus_subset(corp, docvar1 == "Hamburg"),
  tolower = TRUE,
  remove_numbers = TRUE,
  remove = stopwords("SMART")
)

textplot_wordcloud(c.plot, min.freq = 200, color = col)

(3) München

# Document-feature matrix of the München listings (lower-cased, numbers and
# SMART stopwords removed), drawn as a word cloud of terms occurring at
# least 50 times.
c.plot <- dfm(
  corpus_subset(corp, docvar1 == "München"),
  tolower = TRUE,
  remove_numbers = TRUE,
  remove = stopwords("SMART")
)

textplot_wordcloud(c.plot, min.freq = 50, color = col)

(4) Köln

# Document-feature matrix of the Köln listings (lower-cased, numbers and
# SMART stopwords removed), drawn as a word cloud of terms occurring at
# least 50 times.
c.plot <- dfm(
  corpus_subset(corp, docvar1 == "Köln"),
  tolower = TRUE,
  remove_numbers = TRUE,
  remove = stopwords("SMART")
)

textplot_wordcloud(c.plot, min.freq = 50, color = col)

(5) Frankfurt

# Document-feature matrix of the Frankfurt listings (lower-cased, numbers and
# SMART stopwords removed), drawn as a word cloud of terms occurring at
# least 50 times.
c.plot <- dfm(
  corpus_subset(corp, docvar1 == "Frankfurt"),
  tolower = TRUE,
  remove_numbers = TRUE,
  remove = stopwords("SMART")
)

textplot_wordcloud(c.plot, min.freq = 50, color = col)

(6) Stuttgart

# Document-feature matrix of the Stuttgart listings (lower-cased, numbers and
# SMART stopwords removed), drawn as a word cloud of terms occurring at
# least 50 times.
c.plot <- dfm(
  corpus_subset(corp, docvar1 == "Stuttgart"),
  tolower = TRUE,
  remove_numbers = TRUE,
  remove = stopwords("SMART")
)

textplot_wordcloud(c.plot, min.freq = 50, color = col)

(7) Dresden

# Document-feature matrix of the Dresden listings (lower-cased, numbers and
# SMART stopwords removed), drawn as a word cloud of terms occurring at
# least 50 times.
c.plot <- dfm(
  corpus_subset(corp, docvar1 == "Dresden"),
  tolower = TRUE,
  remove_numbers = TRUE,
  remove = stopwords("SMART")
)

textplot_wordcloud(c.plot, min.freq = 50, color = col)

Go to Part 2: or go back to the overview

Structured vs. Text Data to predict Airbnb prices

Part 1: Explore and prepare Data