/** File "MaxWordsKey2.scala" by KWR for CSE250, Spring 2022.
 *  Assignment 2, second key using for-loops and variables
 */

import io.StdIn._
import io.Source
import java.io.File         //technically not needed
import java.io.FileWriter   //makes it easy to append
import java.io.PrintWriter  //makes "print" and "println" available
import scala.annotation.tailrec

/** Finds the longest token in a text file after tokenizing each line and
 *  resolving hyphenated word pairs against a dictionary.
 *
 *  Run with an optional first argument naming the input file (default
 *  "words.txt"). The result is printed and also appended to "output.txt".
 *
 *  The entry point is an explicit `main` rather than `extends App`, and the
 *  helpers keep their working state local, so nothing here depends on the
 *  order in which object members are initialized.
 */
object MaxWordsKey2 {

  /** Path of the dictionary file, one word per line. */
  val dictFile = "../wordsDWYL.txt"

  /** Lowercased dictionary words, read from `dictFile` on first use.
   *
   *  Dropping the `.map(_.toLowerCase)` step gives the original, case-preserving
   *  dictionary discussed in essay Q2 at the bottom of this file.
   */
  lazy val dict: Set[String] =
    withLines[Set[String]](dictFile) { lines => lines.map(_.toLowerCase).toSet }

  /** Opens `path`, hands its line iterator to `use`, and always closes the file.
   *
   *  `use` must fully consume the iterator (e.g. via `toSet`/`toArray`) before
   *  returning, because the underlying source is closed as soon as it returns.
   */
  private def withLines[A](path: String)(use: Iterator[String] => A): A = {
    val source = Source.fromFile(path)
    try use(source.getLines()) finally source.close()
  }

  /** Splits a string into the list of its characters, in order.
   *
   *  @param str the text to split; may be empty
   *  @return    the characters of `str`, or `Nil` for the empty string
   */
  def explode(str: String): List[Char] = str.toList

  /** Convert to list with alphanumeric plus ' tokens, else singleton items.
   *
   *  A maximal run of letters, digits and apostrophes forms one token.
   *  Whitespace ends the current token and is dropped. Any other character
   *  ends the current token and becomes a one-character token of its own.
   *
   *  @param charList the characters of one line of text
   *  @return         the tokens of the line, in order
   */
  def tokenize(charList: List[Char]): List[String] = {
    val tokens = List.newBuilder[String]
    val current = new StringBuilder   // the word being accumulated; local, so no state leaks between calls

    def flush(): Unit =
      if (current.nonEmpty) {
        tokens += current.toString
        current.clear()
      }

    for (c <- charList) {
      if (c.isLetterOrDigit || c == '\'') {
        current.append(c)
      } else {
        flush()
        if (!c.isWhitespace) tokens += c.toString
      }
    }
    flush()   // emit a word that runs to the end of the line
    tokens.result()
  }

  /** Find removable hyphens within each line of text.
   *
   *  The removal cases, in order:
   *   1. "battle-field" should become "battlefield", and "vis-a" should become "visa"
   *   2. "horse-buggy" should become "horse" then "buggy" since "horsebuggy" is not
   *      a word but "horse" and "buggy" are words separately.
   *   3. "slo-mo" should be left as it is, since neither "slomo" nor "slo" is in
   *      the dictionary.
   *
   *  Hyphenated chains are resolved two words at a time: the result of resolving a
   *  pair is pushed back onto the front of the remaining input and processed again.
   *  Consecutive hyphens are all appended to the saved word before the next word
   *  is examined. A line consisting of a single lone hyphen yields `Nil`.
   *
   *  Each decision is reported on standard output.
   *
   *  NOTE(review): lookups are case-sensitive while `dict` holds lowercased words,
   *  so capitalized words never match -- confirm this is intended.
   *
   *  @param ell        the tokens of one line, as produced by `tokenize`
   *  @param dictionary the word set to consult; defaults to `dict`
   *  @return           the tokens with hyphens resolved per the cases above
   */
  def dehyphenate(ell: List[String], dictionary: Set[String] = dict): List[String] = {
    // savedWord is the pending word (may end with one or more '-');
    // emitted holds the finished tokens in reverse order.
    @tailrec
    def loop(remaining: List[String], savedWord: String, emitted: List[String]): List[String] =
      remaining match {
        case Nil =>
          if (savedWord == "" || savedWord == "-") emitted.reverse
          else if (savedWord.last == '-') ("-" :: savedWord.init :: emitted).reverse
          else (savedWord :: emitted).reverse

        case "-" :: rest =>
          loop(rest, savedWord + "-", emitted)

        case word2 :: rest =>
          if (savedWord == "") {
            loop(rest, word2, emitted)
          } else if (savedWord.last == '-') {
            val word1 = savedWord.init
            // The resolved word(s) go back onto the input, so the saved word must be
            // cleared here; keeping word2 saved would emit it a second time.
            if (dictionary.contains(word1 + word2)) {
              println("Dehyphenating " + word1 + "-" + word2 + " to " + word1 + word2)
              loop((word1 + word2) :: rest, "", emitted)
            } else if (dictionary.contains(word1) && dictionary.contains(word2)) {
              println("Dehyphenating " + word1 + "-" + word2 + " to " + s"$word1 $word2")
              loop(word1 :: word2 :: rest, "", emitted)
            } else {
              println("Leaving " + word1 + "-" + word2 + " alone.")
              loop((word1 + "-" + word2) :: rest, "", emitted)
            }
          } else {
            // no hyphen, just swap saved word
            loop(rest, word2, savedWord :: emitted)
          }
      }

    loop(ell, "", Nil)
  }

  /** Reads the input file, reports each new longest word as it is found, then
   *  prints the overall winner and appends the same line to "output.txt".
   *
   *  @param args optional; `args(0)` names the input file (default "words.txt")
   */
  def main(args: Array[String]): Unit = {
    val inputFile = if (args.length >= 1) args(0) else "words.txt"

    val matrix = withLines[Array[List[String]]](inputFile) { lines =>
      lines.map(line => tokenize(explode(line))).toArray
    }
    println("Read " + dict.size + " words into the dictionary")

    // Main body of client code similar to before
    var (maxLen,maxRow,maxCol,maxWord) = (-1,-1,-1,"")
    for (i <- matrix.indices) {
      val lineList = dehyphenate(matrix(i))   // a newly built list; matrix(i) itself is untouched
      for ((word, j) <- lineList.zipWithIndex) {
        // Strict > keeps the first word of maximal length; >= would keep the last.
        if (word.length > maxLen) {
          maxLen = word.length
          maxRow = i
          maxCol = j
          maxWord = word
          println(s"\nNew top word and line: $maxWord in " + matrix(i).toList)
        }
      }
    }

    val filep = new PrintWriter(new FileWriter("output.txt",true))   //appends
    try {
      filep.println(s"The word of longest length $maxLen is $maxWord in line $maxRow, column $maxCol")
    } finally {
      filep.close()
    }

    //echo to screen too
    println(s"The word of longest length $maxLen is $maxWord in line $maxRow, column $maxCol")
  }
}

/* Essays put here for convenience.

Q1: This version of "dehyphenate" also takes hyphenated chains two-at-a-time.
This means that on foo-bar-delta it will first apply the rules for "foo-bar".
Then depending on which removal case applies, it will do
   1. foobar-delta ...
   2. foo :: bar-delta ...
   3. "foo-bar"-delta with "foo-bar" now treated as one word.
So on "comical-historical-pastoral" in JustHamlet.txt, it will give
   comical :: historical :: pastoral
But there is one meaningful difference on how it handles multiple hyphens
that are consecutive, i.e., dashes. In case of "devotion--that" in
Gettysburg.txt, it repeats the case option for hyphen
   case "-" :: rest
for both hyphens in a row, without trying the other code. Thus both hyphens
get tacked on to the savedWord variable. That makes "devotion--that" be kept
as a unit, and makes it be reported as the longest word in Gettysburg.txt.
This is instead of "consecrate--" in my "MaxWordsKey1.scala". Of course, YMMV
is fine---and frankly, probably neither of my keys is handling cases of two
*consecutive* hyphens in the best way. And both screw up on *three*
consecutive hyphens, i.e. on "cases---print" in SmallTest.txt.
(Understanding this, not being perfect, is the goal.)

Q2: The original dictionary has 466,551 words. When all words are lowercased,
it becomes 466,547 words. That is, it shrinks by just 4 words! (Hence the Q
reading "if at all"...) The words happen to be AS,as,Dino,dino,The,the,TO,to.
The reason it shrinks is that (unlike List and Array) the Set datatype does
not allow duplicates.
*/