{-# OPTIONS  #-}
-----------------------------------------------------------------------------
-- |
-- Module      : Language.Python.Common.StringEscape
-- Copyright   : (c) 2009 Bernie Pope 
-- License     : BSD-style
-- Maintainer  : bjpop@csse.unimelb.edu.au
-- Stability   : experimental
-- Portability : ghc
--
-- Conversion to/from escaped characters in strings. Note: currently does not
-- support escaped Unicode character names.
-- 
-- See:
-- 
--    * Version 2.6 <http://docs.python.org/2.6/reference/lexical_analysis.html#string-literals>
--  
--    * Version 3.1 <http://docs.python.org/3.1/reference/lexical_analysis.html#string-and-bytes-literals> 
-----------------------------------------------------------------------------

module Language.Python.Common.StringEscape ( 
   -- * String conversion. 
   unescapeString, 
   unescapeRawString,
   -- * Digits allowed in octal and hex representation.
   octalDigits,
   hexDigits) where

import Numeric (readHex, readOct)

-- | Replace the escape sequences found in the body of a normal (non-raw)
-- Python string with the characters they stand for.
--
-- Handled forms: the single-character escapes (backslash, single quote,
-- double quote, a, b, f, n, r, t, v), a backslash followed by a newline
-- (line continuation, which produces no output), octal escapes of one to
-- three digits, and hex escapes of one or two digits. A backslash that does
-- not begin one of these forms is copied through unchanged.

-- XXX does not handle escaped unicode literals
unescapeString :: String -> String
unescapeString [] = []
-- Line continuation: backslash-newline contributes nothing to the result.
unescapeString ('\\' : '\n' : remainder) = unescapeString remainder
-- Hex escape: the guard ensures at least one valid digit follows the 'x',
-- so the parse handed to 'unescapeNumeric' always has a result to take.
unescapeString ('\\' : 'x' : digits@(firstDigit : _))
   | firstDigit `elem` hexDigits =
        unescapeNumeric 2 hexDigits (fst . head . readHex) digits
unescapeString ('\\' : escape@(marker : remainder))
   -- Single-character escape: look the marker up in the table below.
   | Just replacement <- lookup marker simpleEscapes =
        replacement : unescapeString remainder
   -- Octal escape: the marker itself is the first digit of the number.
   | marker `elem` octalDigits =
        unescapeNumeric 3 octalDigits (fst . head . readOct) escape
   where
   -- Escape marker paired with the character it denotes.
   simpleEscapes :: [(Char, Char)]
   simpleEscapes =
      [ ('\\', '\\') -- Backslash (\)
      , ('\'', '\'') -- Single quote (')
      , ('"', '"')   -- Double quote (")
      , ('a', '\a')  -- ASCII Bell (BEL)
      , ('b', '\b')  -- ASCII Backspace (BS)
      , ('f', '\f')  -- ASCII Formfeed (FF)
      , ('n', '\n')  -- ASCII Linefeed (LF)
      , ('r', '\r')  -- ASCII Carriage Return (CR)
      , ('t', '\t')  -- ASCII Horizontal Tab (TAB)
      , ('v', '\v')  -- ASCII Vertical Tab (VT)
      ]
-- Everything else, including a backslash that starts no known escape,
-- is kept as it is.
unescapeString (other : remainder) = other : unescapeString remainder

{-
-- | This function is a placeholder for unescaping characters in raw strings. 
-- The Python documentation explicitly says that 
-- "When an 'r' or 'R' prefix is present, a character following a backslash is included 
-- in the string without change, and all backslashes are left in the string."
-- However it also says that when an 'r' or 'R' prefix is used in conjunction with
-- a 'u' or 'U' prefix, then the \uXXXX and \UXXXXXXXX escape sequences are processed
-- while all other backslashes are left in the string. Currently the function is the identity
-- but it ought to process unicode escape sequences.
-}

-- XXX does not handle escaped unicode literals
-- Placeholder for raw-string unescaping: the input is currently handed back
-- exactly as it was received.
unescapeRawString :: String -> String
unescapeRawString rawText = rawText

{-
-- | Convert escaped sequences of characters into /real/ characters in a raw Python string.
-- Note: despite their name, Python raw strings do allow a small set of character escapings,
-- namely the single and double quote characters and the line continuation marker.
unescapeRawString ('\\':'\'':cs) = '\'' : unescapeRawString cs -- Single quote (')
unescapeRawString ('\\':'"':cs) = '"' : unescapeRawString cs -- Double quote (")
unescapeRawString ('\\':'\n':cs) = unescapeRawString cs -- line continuation
unescapeRawString (c:cs) = c : unescapeRawString cs
unescapeRawString [] = []
-}

{- 
   This is a bit complicated because Python allows between 1 and 3 octal
   characters after the \, and 1 and 2 hex characters after a \x.
-}
-- Decode one numeric escape and then carry on unescaping what follows it.
--
-- Arguments, in order: the maximum number of digits to consume, the set of
-- characters that count as digits, a function turning the collected digit
-- string into a code point, and the input positioned at the first digit.
unescapeNumeric :: Int -> String -> (String -> Int) -> String -> String
unescapeNumeric n numericDigits readNumeric str = go n [] str
   where
   -- Digits are gathered most-recent-first, so they are put back into
   -- reading order before being converted to a character.
   emit :: String -> Char
   emit gathered = toEnum (readNumeric (reverse gathered))

   go remaining gathered input =
      case input of
         -- Input ran out: the digits gathered so far form the last character.
         [] -> [emit gathered]
         (c : cs)
            -- Still within the digit budget and looking at a digit: take it.
            | remaining /= 0 && c `elem` numericDigits ->
                 go (remaining - 1) (c : gathered) cs
            -- Budget spent or not a digit: finish this escape and resume
            -- ordinary unescaping from the current position.
            | otherwise -> emit gathered : unescapeString input

-- | The set of valid octal digits in Python.
octalDigits :: String
octalDigits = "01234567"

-- | The set of valid hex digits in Python. Python accepts hex digits in
-- either case (for example both @xab@ and @xAB@ after a backslash), so the
-- upper-case letters are listed alongside the lower-case ones.
hexDigits :: String
hexDigits = "0123456789abcdefABCDEF"