diff --git a/Main.java b/Main.java index 751386f..60d9106 100644 --- a/Main.java +++ b/Main.java @@ -1,9 +1,14 @@ package org.duangsuse.telegramscanner; +import org.duangsuse.telegramscanner.model.Message; +import org.duangsuse.telegramscanner.scanner.Scanner; import org.duangsuse.telegramscanner.scanner.Utf8LineInputStream; import java.io.IOException; import java.io.PrintStream; +import java.util.Arrays; + +import java.util.List; /** * Application main class @@ -27,7 +32,7 @@ private Main() {} /** * Standard input */ - private static PrintStream err = System.err; + public static PrintStream err = System.err; /** * Program entrance @@ -36,7 +41,18 @@ private Main() {} */ public static void main(String... args) { err.print("TelegramScanner version "); err.println(VERSION); + List argList = Arrays.asList(args); + + if (argList.contains("-test")) + testInput(); + + for (Message stringMessage : new Scanner(System.in)) { + out.print(stringMessage.toString()); + } + } + @SuppressWarnings("WeakerAccess") + public static void testInput() { Utf8LineInputStream input = new Utf8LineInputStream(System.in); String line = ""; diff --git a/helper/Strings.java b/helper/Strings.java index a9a28ea..8dc1f09 100644 --- a/helper/Strings.java +++ b/helper/Strings.java @@ -1,5 +1,7 @@ package org.duangsuse.telegramscanner.helper; +import org.jetbrains.annotations.Contract; + /** * String helper program */ @@ -11,8 +13,9 @@ public final class Strings { * @param str target string * @return if str.length greater than n, then sub-sequence str, else return str */ + @Contract("_, null -> !null") public static String take(int n, String str) { - if (n == 0 || str.length() == 0) + if (str == null || n == 0 || str.length() == 0) return ""; if (str.length() > n) diff --git a/model/Message.java b/model/Message.java index 101ac8d..6210977 100644 --- a/model/Message.java +++ b/model/Message.java @@ -1,6 +1,7 @@ package org.duangsuse.telegramscanner.model; import org.duangsuse.telegramscanner.helper.Strings; +import org.duangsuse.telegramscanner.sourcemanager.Identifiable; import java.util.Collection; import java.util.LinkedList; @@ -17,7 +18,7 @@ *
  • hashTags * */ -public class Message { +public class Message implements Identifiable { private MessageHeaderType headerType = MessageHeaderType.NORMAL; /** * Message header (name, publishedAt) @@ -47,15 +48,21 @@ public class Message { */ public Message() {} + public Message(MessageHead head, MessageHeaderType type, T extRef) { + this.header = head; + this.headerType = type; + this.messageExtRef = extRef; + } + @Override public String toString() { // count links and hashtags final StringBuilder desc = new StringBuilder(); - if (links.size() != 0) desc.append(links.size()).append(" links"); + if (links.size() != 0) desc.append(links.size()).append(" links, "); if (hashtags.size() != 0) desc.append(hashtags.size()).append(" tags"); final String fmt = "Message{Hd%s, Bd%s, ext=%s}[%s](%s..., %s)"; - return String.format(fmt, headerType, bodyType, messageExtRef.toString(), header, Strings.take(BODY_PREVIEW_LEN, messageBody), desc); + return String.format(fmt, headerType, bodyType, messageExtRef.toString(), header.toString(), Strings.take(BODY_PREVIEW_LEN, messageBody), desc); } @Override @@ -87,6 +94,11 @@ public int hashCode() { return result; } + @Override + public int getIdentity() { + return System.identityHashCode(this); + } + public MessageHeaderType getHeaderType() { return headerType; } diff --git a/model/MessageHeaderType.java b/model/MessageHeaderType.java index 8f31445..8afc125 100644 --- a/model/MessageHeaderType.java +++ b/model/MessageHeaderType.java @@ -12,7 +12,7 @@ public enum MessageHeaderType { /** * Reply to message */ - RELPY, + REPLY, /** * Forwarded message */ diff --git a/scanner/Scanner.java b/scanner/Scanner.java new file mode 100644 index 0000000..7ea1a06 --- /dev/null +++ b/scanner/Scanner.java @@ -0,0 +1,384 @@ +package org.duangsuse.telegramscanner.scanner; + +import org.duangsuse.telegramscanner.Main; +import org.duangsuse.telegramscanner.helper.Strings; +import org.duangsuse.telegramscanner.model.*; +import org.jetbrains.annotations.NotNull; + +import java.io.IOException; +import java.io.InputStream; +import java.util.*; +import java.util.function.Consumer; +import java.util.function.Function; +import java.util.regex.Matcher; + +/** + * Telegram message scanner + *
    + * scanner is an iterable object (stream), calling it's next method will + * result a new message object (if successful) and move line pointer to message + * text end (prepare to read next message) or null (if EOS occur) + * + * @see org.duangsuse.telegramscanner.model.MessageHead + * @see org.duangsuse.telegramscanner.model.Message scanner stream + * + * @author duangsuse + */ +public class Scanner extends Utf8LineInputStream implements Iterable> { + private ScannerState state = ScannerState.EXPECT_MESSAGE; + + private int offset, line, messageNo, localLine; + + private String lastLine; + private Message lastMessage; + + // used in lambda + private MessageHead $lastHead; + // used in lambda + private String $extRef; + private MessageHeaderType $lastHeadType = MessageHeaderType.NORMAL; + private MessageBodyType $lastBodyType; + // used in lambda + private final StringBuffer bodyBuffer = new StringBuffer(); + private List $hashtags = new LinkedList<>(), $links = new LinkedList<>(), + $bareLinks = new LinkedList<>(), $inlineLinks = new LinkedList<>(); + + private boolean keepLineOnce = false; + + public Scanner(InputStream is) { + super(is); + } + + /* Temporary field */ + //private String scanningHeadName; + //private Date scanningHeadDate; + + // and tgType, tgExtRef, tgBodyType, tgBody + + @NotNull + @Override + public Iterator> iterator() { + return new Iterator>() { + @Override + public boolean hasNext() { + return nextLine() != null; + // must be called before next(), moving data(line) pointer to next line + } + + @Override + public Message next() { + lastMessage = new Message<>(); + + String tgName = ""; + Date published = null; + + // now, scan them + switch (state) { + case EXPECT_MESSAGE: + lineDoUntil((s) -> RegexConstants.MESSAGE_HEAD.matcher(s).matches(), (m) -> + scannerWarn("Ignoring line " + m)); + //scannerWarn("INFO", lastLine); + //now message header is in lastLine + + // set tgName and tgHeader + Matcher m = RegexConstants.MESSAGE_HEAD.matcher(lastLine); + m.reset(); + assert m.matches(): "Checked lastLine should be matched regex pattern"; + if (m.matches()) + tgName = m.group(1); + + final String dd = m.group(2); + final String MM = m.group(3); + final String yy = m.group(4); + final String hh = m.group(5); + final String mm = m.group(6); + + try { + published = new Date((Integer.parseInt(yy) + 2000) - 1900, Integer.parseInt(MM) - 1, Integer.parseInt(dd), Integer.parseInt(hh), Integer.parseInt(mm)); + } catch (NumberFormatException e) { + scannerWarn("Bad number format, " + e.getMessage()); + published = new Date(); + } + + scannerInfo("Begin scan header, " + String.format("name: %s, date: %s", tgName, published)); + state = ScannerState.SCAN_HEAD; + + /* fail through */ + case SCAN_HEAD: + String finalTgName = tgName; + Date finalPublished = published; + + nextLine(); + + lineDoWhile((it) -> it.startsWith("["), (line) -> { + // check special matches, set tgType, tgExtRef, tgBodyType + Matcher match; + MessageHeaderType type = MessageHeaderType.NORMAL; + String extRef = ""; + + if (RegexConstants.HEAD_FORWARD.matcher(line).matches()) { + match = RegexConstants.HEAD_FORWARD.matcher(line); + type = MessageHeaderType.FORWARDED; + if (match.find()) + extRef = match.group(1); + + } else if (RegexConstants.HEAD_REPLY.matcher(line).matches()) { + match = RegexConstants.HEAD_REPLY.matcher(line); + type = MessageHeaderType.REPLY; + if (match.find()) + extRef = match.group(1); + + } else if (RegexConstants.HEAD_FILE.matcher(line).matches()) { + match = RegexConstants.HEAD_FILE.matcher(line); + type = MessageHeaderType.HAS_FILE; + if (match.find()) + extRef = match.group(1); + } else if (RegexConstants.HEAD_STICKER.matcher(line).matches()) { + match = RegexConstants.HEAD_STICKER.matcher(line); + type = MessageHeaderType.IS_STICKER; + if (match.find()) /* certainly */ + extRef = match.group(1); + } else if (RegexConstants.HEAD_IS_ALBUM.matcher(line).matches()) { + type = MessageHeaderType.A_ALBUM; + } else if (RegexConstants.HEAD_IS_PHOTO.matcher(line).matches()) { + type = MessageHeaderType.A_PHOTO; + } + + if (type == MessageHeaderType.FORWARDED) + $lastHead = new ForwardedMessageHead(finalTgName, finalPublished, extRef); + if (type == MessageHeaderType.REPLY) + $lastHead = new RepliedMessageHead(finalTgName, finalPublished, extRef); + + $lastHead = new MessageHead(finalTgName, finalPublished); + $extRef = extRef; + $lastHeadType = type; + }); + + if ($lastHead == null) + $lastHead = new MessageHead(tgName, published); + + if ($extRef == null) + $extRef = String.valueOf(""); + + scannerInfo("Break; Scanning message body, " + $lastHeadType + "~" + $lastHead.toString() + ", E:" + $extRef); + state = ScannerState.SCAN_BODY; + + /* fail through */ + case SCAN_BODY: + lineDoUntil((it) -> it.endsWith("]") && RegexConstants.MESSAGE_HEAD.matcher(it).matches(), (line) -> { + // check label links, inline links, plain links, hashtags + // and read body text + + Matcher tagsMatcher = RegexConstants.MessageBodyRegexConstants.HASHTAG.matcher(line); + Matcher inlineMatcher = RegexConstants.MessageBodyRegexConstants.LINK_INLINED.matcher(line); + Matcher bareMatcher = RegexConstants.MessageBodyRegexConstants.LINK_BARE.matcher(line); + Matcher markdownMatcher = RegexConstants.MessageBodyRegexConstants.LINK_TELEGRAM.matcher(line); + + matchTextPart(tagsMatcher, $hashtags, 2); + matchTextPart(inlineMatcher, $inlineLinks, 1, 2); + matchTextPart(bareMatcher, $bareLinks, 1, 2); + matchTextPart(markdownMatcher, $links, 1, 2); + + bodyBuffer.append(line); + }); + keepLineOnce = true; // keep message head line + + if ($links.size() + $inlineLinks.size() + $bareLinks.size() != 0) { + if ($hashtags.isEmpty()) $lastBodyType = MessageBodyType.HAS_LINKS; + else $lastBodyType = MessageBodyType.HAS_LINKS_AND_HASTAGS; + } else if (!$hashtags.isEmpty()) + $lastBodyType = MessageBodyType.HAS_HASHTAGS; + else $lastBodyType = MessageBodyType.NORMAL; + + lastMessage = new Message<>($lastHead, $lastHeadType, $extRef); + lastMessage.setBodyType($lastBodyType); + lastMessage.setMessageBody(bodyBuffer.toString()); + + lastMessage.getLinks().addAll($links); + lastMessage.getLinks().addAll($inlineLinks); + lastMessage.getLinks().addAll($bareLinks); + + lastMessage.getHashtags().addAll($hashtags); + + $links.clear(); + $hashtags.clear(); + $bareLinks.clear(); + $inlineLinks.clear(); + + // clear + if (bodyBuffer.length() != 0) + bodyBuffer.delete(0, bodyBuffer.length()); + + scannerInfo("Break; Scanning new message"); + state = ScannerState.EXPECT_MESSAGE; + break; + } + + ++messageNo; + localLine = 0; + return lastMessage; + } + }; + } + + /** + * Match text part using {@link Matcher}, collecting groups + * + * @param matcher text matcher + * @param dst destination collection + * @param groups to be collected (and concatenated) + */ + private void matchTextPart(@NotNull Matcher matcher, Collection dst, int... groups) { + //if (matcher.matches()) + // for (int i = 1; i < matcher.groupCount(); i++) + // dst.add(matcher.group(i).trim()); + + while (matcher.find()) { + StringBuilder sb = new StringBuilder(); + for (int i: groups) { + sb.append(matcher.group(i)); + } + dst.add(sb.toString()); + } + } + + /** + * messageBody toString preview length + */ + private static final int BODY_PREVIEW_LEN = 10; + + /** + * Call with default tag "WARN" + * + * @param message warning message + */ + protected void scannerWarn(String message) { scannerLog("WARN", message); } + /** + * Call with tag "INFO" + * + * @param message warning message + */ + protected void scannerInfo(String message) { scannerLog("INFO", message); } + /** + * Output a warning log message + * + * @param tag tag to be applied + * @param message message to output + */ + protected void scannerLog(final String tag, String message) { + Main.err.print(tag); + Main.err.print(": "); + Main.err.print(state); + Main.err.print("@" + offset + "(" + line + ")"); + Main.err.print(String.format(": M#%d:%d, %s...", messageNo, localLine, Strings.take(BODY_PREVIEW_LEN, lastLine))); + Main.err.println(message); + } + + /** + * Do something while predicate is true + *
    + * lastLine must not null before calling + * + * @param predicate predicate to be called with current line + * @param action result action to be called when predicate is true + */ + @SuppressWarnings("WeakerAccess") + protected void lineDoWhile(Function predicate, Consumer action) { + while (lastLine != null && predicate.apply(lastLine)) { + action.accept(lastLine); + nextLine(); + } + } + + /** + * Do something while predicate is false + *
    + * lastLine must not null before calling + * + * @param predicate predicate to be called with current line + * @param action result action to be called when predicate is false + */ + protected void lineDoUntil(final Function predicate, Consumer action) { + lineDoWhile((s) -> !predicate.apply(s), action); + } + + @Override + public String toString() { + return String.format("TelegramScanner(F:%sS:%s,Off=%d,Line=%d,No=%d:%d@%s)", + super.toString(), state, offset, line, messageNo, localLine, lastMessage); + } + + /** + * Ensure one-line-readable line stream + * + * @see this#lastLine "buffered" line + * @return next line if succeeded, null otherwise + */ + protected String nextLine() { + ++localLine; + try { + if (lastLine == null) + return readLine(); + + if (!keepLineOnce) { + readLine(); + } else keepLineOnce = false; // line kept + + return lastLine; + } catch (IOException ignored) { return null; } + } + + @Override + public String readLine() throws IOException { + String read = super.readLine(); + lastLine = read; + line++; + if (read != null) + offset += read.length(); + return read; + } + + public ScannerState getState() { + return state; + } + + public int getOffset() { + return offset; + } + + public int getLine() { + return line; + } + + public int getMessageNo() { + return messageNo; + } + + public int getLocalLine() { + return localLine; + } + + public Message getLastMessage() { + return lastMessage; + } + + /** + * Scanner state + * + * @see Scanner + */ + public enum ScannerState { + /** + * Message reader entrance + */ + EXPECT_MESSAGE, + /** + * Scanning message header (may read special tags) + */ + SCAN_HEAD, + /** + * Scanning message body (may read links) + */ + SCAN_BODY + } +} diff --git a/scanner/Utf8LineInputStream.java b/scanner/Utf8LineInputStream.java index 2ff6b23..5ecb400 100644 --- a/scanner/Utf8LineInputStream.java +++ b/scanner/Utf8LineInputStream.java @@ -1,5 +1,7 @@ package org.duangsuse.telegramscanner.scanner; +import org.jetbrains.annotations.Contract; + import java.io.*; import java.nio.charset.Charset; @@ -16,7 +18,7 @@ public class Utf8LineInputStream implements Closeable { private DataInput target; /** - * Construct using data input {@link DataInputStream} + * Construct using data input {@link DataInputStream}, no wrappers used * * @param dataIn data input instance */ @@ -25,7 +27,7 @@ public Utf8LineInputStream(DataInput dataIn) { } /** - * Construct using byte input stream + * Construct using byte input stream, wrapped by UTF-8 reader * * @see InputStreamReader with utf-8 decoder support * @param in line-based input stream @@ -42,6 +44,7 @@ public int read() throws IOException { this.target = new DataInputStream(is); } + @Contract(value = "null -> false", pure = true) @Override public boolean equals(Object o) { if (this == o) return true;